+ MODEL_PATH=/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct
+ SYSTEM_PROMPT='You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.'
+ python3 -m verl.trainer.main config=examples/config.yaml data.train_files=leonardPKU/GEOQA_8K_R1V@train data.val_files=leonardPKU/GEOQA_8K_R1V@test data.image_key=images 'data.system_prompt=You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.' worker.actor.model.model_path=/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct worker.rollout.tensor_parallel_size=1 worker.rollout.enable_chunked_prefill=false trainer.experiment_name=qwen2_5_vl_3b_GEOQA_8K_R1V worker.reward.compute_score=r1v trainer.n_gpus_per_node=2
INFO 04-07 21:24:40 [__init__.py:239] Automatically detected platform cuda.
2025-04-07 21:24:44,643 INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
[36m(pid=3309020)[0m INFO 04-07 21:24:50 [__init__.py:239] Automatically detected platform cuda.
[36m(Runner pid=3309020)[0m {
[36m(Runner pid=3309020)[0m "data": {
[36m(Runner pid=3309020)[0m "train_files": "leonardPKU/GEOQA_8K_R1V@train",
[36m(Runner pid=3309020)[0m "val_files": "leonardPKU/GEOQA_8K_R1V@test",
[36m(Runner pid=3309020)[0m "prompt_key": "problem",
[36m(Runner pid=3309020)[0m "answer_key": "answer",
[36m(Runner pid=3309020)[0m "image_key": "images",
[36m(Runner pid=3309020)[0m "max_prompt_length": 2048,
[36m(Runner pid=3309020)[0m "max_response_length": 5500,
[36m(Runner pid=3309020)[0m "rollout_batch_size": 512,
[36m(Runner pid=3309020)[0m "val_batch_size": -1,
[36m(Runner pid=3309020)[0m "system_prompt": "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.",
[36m(Runner pid=3309020)[0m "shuffle": true,
[36m(Runner pid=3309020)[0m "seed": 1,
[36m(Runner pid=3309020)[0m "max_pixels": 4194304,
[36m(Runner pid=3309020)[0m "min_pixels": 262144
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "worker": {
[36m(Runner pid=3309020)[0m "hybrid_engine": true,
[36m(Runner pid=3309020)[0m "actor": {
[36m(Runner pid=3309020)[0m "strategy": "fsdp",
[36m(Runner pid=3309020)[0m "global_batch_size": 128,
[36m(Runner pid=3309020)[0m "micro_batch_size_per_device_for_update": 4,
[36m(Runner pid=3309020)[0m "micro_batch_size_per_device_for_experience": 16,
[36m(Runner pid=3309020)[0m "max_grad_norm": 1.0,
[36m(Runner pid=3309020)[0m "clip_ratio": 0.2,
[36m(Runner pid=3309020)[0m "ppo_epochs": 1,
[36m(Runner pid=3309020)[0m "padding_free": true,
[36m(Runner pid=3309020)[0m "ulysses_sequence_parallel_size": 1,
[36m(Runner pid=3309020)[0m "use_torch_compile": true,
[36m(Runner pid=3309020)[0m "model": {
[36m(Runner pid=3309020)[0m "model_path": "/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct",
[36m(Runner pid=3309020)[0m "tokenizer_path": "/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct",
[36m(Runner pid=3309020)[0m "override_config": {},
[36m(Runner pid=3309020)[0m "enable_gradient_checkpointing": true,
[36m(Runner pid=3309020)[0m "trust_remote_code": false,
[36m(Runner pid=3309020)[0m "freeze_vision_tower": false
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "optim": {
[36m(Runner pid=3309020)[0m "lr": 1e-06,
[36m(Runner pid=3309020)[0m "betas": [
[36m(Runner pid=3309020)[0m 0.9,
[36m(Runner pid=3309020)[0m 0.999
[36m(Runner pid=3309020)[0m ],
[36m(Runner pid=3309020)[0m "weight_decay": 0.01,
[36m(Runner pid=3309020)[0m "strategy": "adamw",
[36m(Runner pid=3309020)[0m "lr_warmup_ratio": 0.0,
[36m(Runner pid=3309020)[0m "min_lr_ratio": null,
[36m(Runner pid=3309020)[0m "warmup_style": "constant",
[36m(Runner pid=3309020)[0m "training_steps": -1
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "fsdp": {
[36m(Runner pid=3309020)[0m "enable_full_shard": true,
[36m(Runner pid=3309020)[0m "enable_cpu_offload": false,
[36m(Runner pid=3309020)[0m "enable_rank0_init": true,
[36m(Runner pid=3309020)[0m "use_orig_params": false,
[36m(Runner pid=3309020)[0m "torch_dtype": null,
[36m(Runner pid=3309020)[0m "fsdp_size": -1,
[36m(Runner pid=3309020)[0m "mp_param_dtype": "bf16",
[36m(Runner pid=3309020)[0m "mp_reduce_dtype": "fp32",
[36m(Runner pid=3309020)[0m "mp_buffer_dtype": "fp32"
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "offload": {
[36m(Runner pid=3309020)[0m "offload_params": true,
[36m(Runner pid=3309020)[0m "offload_optimizer": true
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "global_batch_size_per_device": -1,
[36m(Runner pid=3309020)[0m "disable_kl": false,
[36m(Runner pid=3309020)[0m "use_kl_loss": true,
[36m(Runner pid=3309020)[0m "kl_penalty": "low_var_kl",
[36m(Runner pid=3309020)[0m "kl_coef": 0.01
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "critic": {
[36m(Runner pid=3309020)[0m "strategy": "fsdp",
[36m(Runner pid=3309020)[0m "global_batch_size": 256,
[36m(Runner pid=3309020)[0m "micro_batch_size_per_device_for_update": 4,
[36m(Runner pid=3309020)[0m "micro_batch_size_per_device_for_experience": 16,
[36m(Runner pid=3309020)[0m "max_grad_norm": 1.0,
[36m(Runner pid=3309020)[0m "cliprange_value": 0.5,
[36m(Runner pid=3309020)[0m "ppo_epochs": 1,
[36m(Runner pid=3309020)[0m "padding_free": false,
[36m(Runner pid=3309020)[0m "ulysses_sequence_parallel_size": 1,
[36m(Runner pid=3309020)[0m "model": {
[36m(Runner pid=3309020)[0m "model_path": null,
[36m(Runner pid=3309020)[0m "tokenizer_path": null,
[36m(Runner pid=3309020)[0m "override_config": {},
[36m(Runner pid=3309020)[0m "enable_gradient_checkpointing": true,
[36m(Runner pid=3309020)[0m "trust_remote_code": true,
[36m(Runner pid=3309020)[0m "freeze_vision_tower": false
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "optim": {
[36m(Runner pid=3309020)[0m "lr": 1e-06,
[36m(Runner pid=3309020)[0m "betas": [
[36m(Runner pid=3309020)[0m 0.9,
[36m(Runner pid=3309020)[0m 0.999
[36m(Runner pid=3309020)[0m ],
[36m(Runner pid=3309020)[0m "weight_decay": 0.01,
[36m(Runner pid=3309020)[0m "strategy": "adamw",
[36m(Runner pid=3309020)[0m "lr_warmup_ratio": 0.0,
[36m(Runner pid=3309020)[0m "min_lr_ratio": null,
[36m(Runner pid=3309020)[0m "warmup_style": "constant",
[36m(Runner pid=3309020)[0m "training_steps": -1
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "fsdp": {
[36m(Runner pid=3309020)[0m "enable_full_shard": true,
[36m(Runner pid=3309020)[0m "enable_cpu_offload": false,
[36m(Runner pid=3309020)[0m "enable_rank0_init": false,
[36m(Runner pid=3309020)[0m "use_orig_params": false,
[36m(Runner pid=3309020)[0m "torch_dtype": null,
[36m(Runner pid=3309020)[0m "fsdp_size": -1,
[36m(Runner pid=3309020)[0m "mp_param_dtype": "bf16",
[36m(Runner pid=3309020)[0m "mp_reduce_dtype": "fp32",
[36m(Runner pid=3309020)[0m "mp_buffer_dtype": "fp32"
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "offload": {
[36m(Runner pid=3309020)[0m "offload_params": false,
[36m(Runner pid=3309020)[0m "offload_optimizer": false
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "global_batch_size_per_device": -1
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "ref": {
[36m(Runner pid=3309020)[0m "strategy": "fsdp",
[36m(Runner pid=3309020)[0m "fsdp": {
[36m(Runner pid=3309020)[0m "enable_full_shard": true,
[36m(Runner pid=3309020)[0m "enable_cpu_offload": true,
[36m(Runner pid=3309020)[0m "enable_rank0_init": true,
[36m(Runner pid=3309020)[0m "use_orig_params": false,
[36m(Runner pid=3309020)[0m "torch_dtype": null,
[36m(Runner pid=3309020)[0m "fsdp_size": -1,
[36m(Runner pid=3309020)[0m "mp_param_dtype": "bf16",
[36m(Runner pid=3309020)[0m "mp_reduce_dtype": "fp32",
[36m(Runner pid=3309020)[0m "mp_buffer_dtype": "fp32"
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m DeprecationWarning: `ray.state.available_resources_per_node` is a private attribute and access will be removed in a future Ray version.
[36m(Runner pid=3309020)[0m "offload": {
[36m(Runner pid=3309020)[0m "offload_params": false,
[36m(Runner pid=3309020)[0m "offload_optimizer": false
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "micro_batch_size_per_device_for_experience": 16,
[36m(Runner pid=3309020)[0m "padding_free": true,
[36m(Runner pid=3309020)[0m "ulysses_sequence_parallel_size": 1,
[36m(Runner pid=3309020)[0m "use_torch_compile": true
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "reward": {
[36m(Runner pid=3309020)[0m "reward_type": "function",
[36m(Runner pid=3309020)[0m "compute_score": "r1v"
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "rollout": {
[36m(Runner pid=3309020)[0m "name": "vllm",
[36m(Runner pid=3309020)[0m "n": 5,
[36m(Runner pid=3309020)[0m "temperature": 1.0,
[36m(Runner pid=3309020)[0m "top_p": 1.0,
[36m(Runner pid=3309020)[0m "top_k": -1,
[36m(Runner pid=3309020)[0m "limit_images": 0,
[36m(Runner pid=3309020)[0m "dtype": "bf16",
[36m(Runner pid=3309020)[0m "gpu_memory_utilization": 0.6,
[36m(Runner pid=3309020)[0m "ignore_eos": false,
[36m(Runner pid=3309020)[0m "enforce_eager": false,
[36m(Runner pid=3309020)[0m "free_cache_engine": false,
[36m(Runner pid=3309020)[0m "enable_chunked_prefill": false,
[36m(Runner pid=3309020)[0m "tensor_parallel_size": 1,
[36m(Runner pid=3309020)[0m "max_num_batched_tokens": 8192,
[36m(Runner pid=3309020)[0m "max_num_seqs": 1024,
[36m(Runner pid=3309020)[0m "disable_log_stats": true,
[36m(Runner pid=3309020)[0m "val_override_config": {
[36m(Runner pid=3309020)[0m "temperature": 0.5,
[36m(Runner pid=3309020)[0m "n": 1
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "prompt_length": 2048,
[36m(Runner pid=3309020)[0m "response_length": 5500
[36m(Runner pid=3309020)[0m }
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "algorithm": {
[36m(Runner pid=3309020)[0m "gamma": 1.0,
[36m(Runner pid=3309020)[0m "lam": 1.0,
[36m(Runner pid=3309020)[0m "adv_estimator": "grpo",
[36m(Runner pid=3309020)[0m "disable_kl": false,
[36m(Runner pid=3309020)[0m "use_kl_loss": true,
[36m(Runner pid=3309020)[0m "kl_penalty": "low_var_kl",
[36m(Runner pid=3309020)[0m "kl_coef": 0.01,
[36m(Runner pid=3309020)[0m "kl_type": "fixed",
[36m(Runner pid=3309020)[0m "kl_horizon": 0.0,
[36m(Runner pid=3309020)[0m "kl_target": 0.0
[36m(Runner pid=3309020)[0m },
[36m(Runner pid=3309020)[0m "trainer": {
[36m(Runner pid=3309020)[0m "total_episodes": 10,
[36m(Runner pid=3309020)[0m "max_steps": null,
[36m(Runner pid=3309020)[0m "project_name": "easy_r1",
[36m(Runner pid=3309020)[0m "experiment_name": "qwen2_5_vl_3b_GEOQA_8K_R1V",
[36m(Runner pid=3309020)[0m "logger": [
[36m(Runner pid=3309020)[0m "console",
[36m(Runner pid=3309020)[0m "wandb"
[36m(Runner pid=3309020)[0m ],
[36m(Runner pid=3309020)[0m "nnodes": 1,
[36m(Runner pid=3309020)[0m "n_gpus_per_node": 2,
[36m(Runner pid=3309020)[0m "critic_warmup": 0,
[36m(Runner pid=3309020)[0m "val_freq": 5,
[36m(Runner pid=3309020)[0m "val_before_train": true,
[36m(Runner pid=3309020)[0m "val_only": false,
[36m(Runner pid=3309020)[0m "val_generations_to_log": 5,
[36m(Runner pid=3309020)[0m "save_freq": 5,
[36m(Runner pid=3309020)[0m "save_limit": 3,
[36m(Runner pid=3309020)[0m "save_checkpoint_path": "checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V",
[36m(Runner pid=3309020)[0m "load_checkpoint_path": null
[36m(Runner pid=3309020)[0m }
[36m(Runner pid=3309020)[0m }
[36m(Runner pid=3309020)[0m Size of train_dataset: 8031
[36m(Runner pid=3309020)[0m Train data file: leonardPKU/GEOQA_8K_R1V@train
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m {'problem': 'In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?', 'multi_modal_data': {'image': []}, 'multi_modal_inputs': {'pixel_values': tensor([[1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m ...,
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459]]), 'image_grid_thw': tensor([[ 1, 22, 62]])}, 'input_ids': tensor([151643, 151643, 151643, ..., 151644, 77091, 198]), 'attention_mask': tensor([0, 0, 0, ..., 1, 1, 1]), 'position_ids': tensor([[ 0, 0, 0, ..., 144, 145, 146],
[36m(Runner pid=3309020)[0m [ 0, 0, 0, ..., 144, 145, 146],
[36m(Runner pid=3309020)[0m [ 0, 0, 0, ..., 144, 145, 146]]), 'raw_prompt_ids': [151644, 8948, 198, 2610, 525, 264, 10950, 15235, 21388, 11, 6188, 311, 3897, 1632, 5504, 1497, 291, 323, 11682, 14507, 13, 1446, 34813, 1744, 911, 279, 32711, 1882, 438, 458, 5306, 1615, 76728, 323, 1221, 3410, 279, 1196, 448, 279, 4226, 13, 576, 32711, 1882, 27732, 7206, 43810, 2878, 366, 26865, 29, 323, 690, 26865, 29, 9492, 11, 323, 279, 1590, 4226, 27732, 7206, 43810, 2878, 366, 9217, 29, 323, 690, 9217, 29, 9492, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151653, 641, 279, 2661, 13549, 11, 421, 9210, 220, 16, 702, 264, 6629, 315, 220, 18, 20, 13, 15, 12348, 11, 1128, 374, 279, 6629, 315, 9210, 220, 17, 30, 151645, 198, 151644, 77091, 198], 'ground_truth': '145°'}
[36m(Runner pid=3309020)[0m Size of val_dataset: 754
[36m(Runner pid=3309020)[0m Val data file: leonardPKU/GEOQA_8K_R1V@test
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m {'problem': 'In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?', 'multi_modal_data': {'image': []}, 'multi_modal_inputs': {'pixel_values': tensor([[1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(WorkerDict pid=3319288)[0m
Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]
[36m(WorkerDict pid=3319541)[0m You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
[36m(WorkerDict pid=3319288)[0m
Loading checkpoint shards: 50%|█████ | 1/2 [00:00<00:00, 6.92it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 10.64it/s]
[36m(WorkerDict pid=3319288)[0m [rank0]:[W407 21:25:33.103225236 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m ...,
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459],
[36m(Runner pid=3309020)[0m [1.9303, 1.9303, 1.9303, ..., 2.1459, 2.1459, 2.1459]]), 'image_grid_thw': tensor([[ 1, 22, 62]])}, 'input_ids': tensor([151643, 151643, 151643, ..., 151644, 77091, 198]), 'attention_mask': tensor([0, 0, 0, ..., 1, 1, 1]), 'position_ids': tensor([[ 0, 0, 0, ..., 144, 145, 146],
[36m(Runner pid=3309020)[0m [ 0, 0, 0, ..., 144, 145, 146],
[36m(Runner pid=3309020)[0m [ 0, 0, 0, ..., 144, 145, 146]]), 'raw_prompt_ids': [151644, 8948, 198, 2610, 525, 264, 10950, 15235, 21388, 11, 6188, 311, 3897, 1632, 5504, 1497, 291, 323, 11682, 14507, 13, 1446, 34813, 1744, 911, 279, 32711, 1882, 438, 458, 5306, 1615, 76728, 323, 1221, 3410, 279, 1196, 448, 279, 4226, 13, 576, 32711, 1882, 27732, 7206, 43810, 2878, 366, 26865, 29, 323, 690, 26865, 29, 9492, 11, 323, 279, 1590, 4226, 27732, 7206, 43810, 2878, 366, 9217, 29, 323, 690, 9217, 29, 9492, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151653, 641, 279, 2661, 13549, 11, 421, 9210, 220, 16, 702, 264, 6629, 315, 220, 18, 20, 13, 15, 12348, 11, 1128, 374, 279, 6629, 315, 9210, 220, 17, 30, 151645, 198, 151644, 77091, 198], 'ground_truth': '145°'}
[36m(Runner pid=3309020)[0m Size of train dataloader: 15
[36m(Runner pid=3309020)[0m Size of val dataloader: 1
[36m(Runner pid=3309020)[0m Total training steps: 150
[36m(pid=3319288)[0m INFO 04-07 21:25:24 [__init__.py:239] Automatically detected platform cuda.
[36m(pid=3319541)[0m INFO 04-07 21:25:30 [__init__.py:239] Automatically detected platform cuda.
[36m(WorkerDict pid=3319288)[0m actor will use global batch size 640.
[36m(WorkerDict pid=3319288)[0m Model config: Qwen2_5_VLConfig {
[36m(WorkerDict pid=3319288)[0m "architectures": [
[36m(WorkerDict pid=3319288)[0m "Qwen2_5_VLForConditionalGeneration"
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "attention_dropout": 0.0,
[36m(WorkerDict pid=3319288)[0m "eos_token_id": 151645,
[36m(WorkerDict pid=3319288)[0m "hidden_act": "silu",
[36m(WorkerDict pid=3319288)[0m "hidden_size": 2048,
[36m(WorkerDict pid=3319288)[0m "image_token_id": 151655,
[36m(WorkerDict pid=3319288)[0m "initializer_range": 0.02,
[36m(WorkerDict pid=3319288)[0m "intermediate_size": 11008,
[36m(WorkerDict pid=3319288)[0m "max_position_embeddings": 128000,
[36m(WorkerDict pid=3319288)[0m "max_window_layers": 70,
[36m(WorkerDict pid=3319288)[0m "model_type": "qwen2_5_vl",
[36m(WorkerDict pid=3319288)[0m "num_attention_heads": 16,
[36m(WorkerDict pid=3319288)[0m "num_hidden_layers": 36,
[36m(WorkerDict pid=3319288)[0m "num_key_value_heads": 2,
[36m(WorkerDict pid=3319288)[0m "pad_token_id": 151643,
[36m(WorkerDict pid=3319288)[0m "rms_norm_eps": 1e-06,
[36m(WorkerDict pid=3319288)[0m "rope_scaling": {
[36m(WorkerDict pid=3319288)[0m "mrope_section": [
[36m(WorkerDict pid=3319288)[0m 16,
[36m(WorkerDict pid=3319288)[0m 24,
[36m(WorkerDict pid=3319288)[0m 24
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "rope_type": "default",
[36m(WorkerDict pid=3319288)[0m "type": "default"
[36m(WorkerDict pid=3319288)[0m },
[36m(WorkerDict pid=3319288)[0m "rope_theta": 1000000.0,
[36m(WorkerDict pid=3319288)[0m "sliding_window": 32768,
[36m(WorkerDict pid=3319288)[0m "tie_word_embeddings": true,
[36m(WorkerDict pid=3319288)[0m "torch_dtype": "bfloat16",
[36m(WorkerDict pid=3319288)[0m "transformers_version": "4.50.3",
[36m(WorkerDict pid=3319288)[0m "use_cache": true,
[36m(WorkerDict pid=3319288)[0m "use_sliding_window": false,
[36m(WorkerDict pid=3319288)[0m "video_token_id": 151656,
[36m(WorkerDict pid=3319288)[0m "vision_config": {
[36m(WorkerDict pid=3319288)[0m "depth": 32,
[36m(WorkerDict pid=3319288)[0m "fullatt_block_indexes": [
[36m(WorkerDict pid=3319288)[0m 7,
[36m(WorkerDict pid=3319288)[0m 15,
[36m(WorkerDict pid=3319288)[0m 23,
[36m(WorkerDict pid=3319288)[0m 31
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "hidden_act": "silu",
[36m(WorkerDict pid=3319288)[0m "hidden_size": 1280,
[36m(WorkerDict pid=3319288)[0m "in_channels": 3,
[36m(WorkerDict pid=3319288)[0m "in_chans": 3,
[36m(WorkerDict pid=3319288)[0m "intermediate_size": 3420,
[36m(WorkerDict pid=3319288)[0m "model_type": "qwen2_5_vl",
[36m(WorkerDict pid=3319288)[0m "num_heads": 16,
[36m(WorkerDict pid=3319288)[0m "out_hidden_size": 2048,
[36m(WorkerDict pid=3319288)[0m "patch_size": 14,
[36m(WorkerDict pid=3319288)[0m "spatial_merge_size": 2,
[36m(WorkerDict pid=3319288)[0m "spatial_patch_size": 14,
[36m(WorkerDict pid=3319288)[0m "temporal_patch_size": 2,
[36m(WorkerDict pid=3319288)[0m "tokens_per_second": 2,
[36m(WorkerDict pid=3319288)[0m "window_size": 112
[36m(WorkerDict pid=3319288)[0m },
[36m(WorkerDict pid=3319288)[0m "vision_end_token_id": 151653,
[36m(WorkerDict pid=3319288)[0m "vision_start_token_id": 151652,
[36m(WorkerDict pid=3319288)[0m "vision_token_id": 151654,
[36m(WorkerDict pid=3319288)[0m "vocab_size": 151936
[36m(WorkerDict pid=3319288)[0m }
[36m(WorkerDict pid=3319288)[0m
[36m(WorkerDict pid=3319288)[0m Ulysses patch applied!
[36m(WorkerDict pid=3319288)[0m NCCL version 2.21.5+cuda12.4
[36m(WorkerDict pid=3319288)[0m Qwen2_5_VLForConditionalGeneration contains 3.75B parameters.
[36m(WorkerDict pid=3319288)[0m After huggingface model init: 0.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m FSDP wrap policy: functools.partial(, transformer_layer_cls={, }).
[36m(WorkerDict pid=3319288)[0m After FSDP module init: 3.49 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Model config: Qwen2_5_VLConfig {
[36m(WorkerDict pid=3319288)[0m "architectures": [
[36m(WorkerDict pid=3319288)[0m "Qwen2_5_VLForConditionalGeneration"
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "attention_dropout": 0.0,
[36m(WorkerDict pid=3319288)[0m "eos_token_id": 151645,
[36m(WorkerDict pid=3319288)[0m "hidden_act": "silu",
[36m(WorkerDict pid=3319288)[0m "hidden_size": 2048,
[36m(WorkerDict pid=3319288)[0m "image_token_id": 151655,
[36m(WorkerDict pid=3319288)[0m "initializer_range": 0.02,
[36m(WorkerDict pid=3319288)[0m "intermediate_size": 11008,
[36m(WorkerDict pid=3319288)[0m "max_position_embeddings": 128000,
[36m(WorkerDict pid=3319288)[0m "max_window_layers": 70,
[36m(WorkerDict pid=3319288)[0m "model_type": "qwen2_5_vl",
[36m(WorkerDict pid=3319288)[0m "num_attention_heads": 16,
[36m(WorkerDict pid=3319288)[0m "num_hidden_layers": 36,
[36m(WorkerDict pid=3319288)[0m "num_key_value_heads": 2,
[36m(WorkerDict pid=3319288)[0m "pad_token_id": 151643,
[36m(WorkerDict pid=3319288)[0m "rms_norm_eps": 1e-06,
[36m(WorkerDict pid=3319288)[0m "rope_scaling": {
[36m(WorkerDict pid=3319288)[0m "mrope_section": [
[36m(WorkerDict pid=3319288)[0m 16,
[36m(WorkerDict pid=3319288)[0m 24,
[36m(WorkerDict pid=3319288)[0m 24
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "rope_type": "default",
[36m(WorkerDict pid=3319288)[0m "type": "default"
[36m(WorkerDict pid=3319288)[0m },
[36m(WorkerDict pid=3319288)[0m "rope_theta": 1000000.0,
[36m(WorkerDict pid=3319288)[0m "sliding_window": 32768,
[36m(WorkerDict pid=3319288)[0m "tie_word_embeddings": true,
[36m(WorkerDict pid=3319288)[0m Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2_5_VLForConditionalGeneration is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
[36m(WorkerDict pid=3319288)[0m Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2_5_VisionTransformerPretrainedModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
[36m(WorkerDict pid=3319541)[0m [rank1]:[W407 21:25:33.080654646 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
[36m(WorkerDict pid=3319288)[0m
Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]
[36m(WorkerDict pid=3319288)[0m
Loading checkpoint shards: 50%|█████ | 1/2 [00:03<00:03, 3.65s/it]
[36m(WorkerDict pid=3319288)[0m
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00, 3.28s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00, 3.33s/it]
[36m(WorkerDict pid=3319541)[0m Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2_5_VLForConditionalGeneration is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
[36m(WorkerDict pid=3319288)[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[36m(WorkerDict pid=3319288)[0m
Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00, ?it/s]
[36m(WorkerDict pid=3319288)[0m
Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:00<00:00, 1.66it/s]
[36m(WorkerDict pid=3319288)[0m
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00, 1.35it/s]
[36m(WorkerDict pid=3319288)[0m
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00, 1.39it/s]
[36m(WorkerDict pid=3319288)[0m
[36m(WorkerDict pid=3319288)[0m "torch_dtype": "bfloat16",
[36m(WorkerDict pid=3319288)[0m "transformers_version": "4.50.3",
[36m(WorkerDict pid=3319288)[0m "use_cache": true,
[36m(WorkerDict pid=3319288)[0m "use_sliding_window": false,
[36m(WorkerDict pid=3319288)[0m "video_token_id": 151656,
[36m(WorkerDict pid=3319288)[0m "vision_config": {
[36m(WorkerDict pid=3319288)[0m "depth": 32,
[36m(WorkerDict pid=3319288)[0m "fullatt_block_indexes": [
[36m(WorkerDict pid=3319288)[0m 7,
[36m(WorkerDict pid=3319288)[0m 15,
[36m(WorkerDict pid=3319288)[0m 23,
[36m(WorkerDict pid=3319288)[0m 31
[36m(WorkerDict pid=3319288)[0m ],
[36m(WorkerDict pid=3319288)[0m "hidden_act": "silu",
[36m(WorkerDict pid=3319288)[0m "hidden_size": 1280,
[36m(WorkerDict pid=3319288)[0m "in_channels": 3,
[36m(WorkerDict pid=3319288)[0m "in_chans": 3,
[36m(WorkerDict pid=3319288)[0m "intermediate_size": 3420,
[36m(WorkerDict pid=3319288)[0m "model_type": "qwen2_5_vl",
[36m(WorkerDict pid=3319288)[0m "num_heads": 16,
[36m(WorkerDict pid=3319288)[0m "out_hidden_size": 2048,
[36m(WorkerDict pid=3319288)[0m "patch_size": 14,
[36m(WorkerDict pid=3319288)[0m "spatial_merge_size": 2,
[36m(WorkerDict pid=3319288)[0m "spatial_patch_size": 14,
[36m(WorkerDict pid=3319288)[0m "temporal_patch_size": 2,
[36m(WorkerDict pid=3319288)[0m "tokens_per_second": 2,
[36m(WorkerDict pid=3319288)[0m "window_size": 112
[36m(WorkerDict pid=3319288)[0m },
[36m(WorkerDict pid=3319288)[0m "vision_end_token_id": 151653,
[36m(WorkerDict pid=3319288)[0m "vision_start_token_id": 151652,
[36m(WorkerDict pid=3319288)[0m "vision_token_id": 151654,
[36m(WorkerDict pid=3319288)[0m "vocab_size": 151936
[36m(WorkerDict pid=3319288)[0m }
[36m(WorkerDict pid=3319288)[0m
[36m(WorkerDict pid=3319288)[0m Ulysses patch applied!
[36m(WorkerDict pid=3319288)[0m Qwen2_5_VLForConditionalGeneration contains 3.75B parameters.
[36m(WorkerDict pid=3319288)[0m After huggingface model init: 3.49 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m FSDP wrap policy: functools.partial(, transformer_layer_cls={, }).
[36m(WorkerDict pid=3319288)[0m After FSDP module init: 12.82 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After optimizer init: 12.82 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After offload actor model during init: 0.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After offload actor optimizer during init: 0.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:25:58 [config.py:585] This model supports multiple tasks: {'generate', 'score', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:25:58 [config.py:1478] Disabling V1 multiprocessing for external launcher.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:25:58 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:25:59 [config.py:585] This model supports multiple tasks: {'reward', 'generate', 'classify', 'embed', 'score'}. Defaulting to 'generate'.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:25:59 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct', speculative_config=None, tokenizer='/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=7548, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"level":3,"custom_ops":["none"],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":512}
[36m(WorkerDict pid=3319541)[0m WARNING 04-07 21:25:59 [utils.py:2321] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:00 [parallel_state.py:954] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:00 [cuda.py:220] Using Flash Attention backend on V1 engine.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:00 [gpu_model_runner.py:1174] Starting to load model /home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct...
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:00 [config.py:3243] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512] is overridden by config [512, 384, 256, 128, 4, 2, 1, 392, 264, 136, 8, 400, 272, 144, 16, 408, 280, 152, 24, 416, 288, 160, 32, 424, 296, 168, 40, 432, 304, 176, 48, 440, 312, 184, 56, 448, 320, 192, 64, 456, 328, 200, 72, 464, 336, 208, 80, 472, 344, 216, 88, 120, 480, 352, 248, 224, 96, 488, 504, 360, 232, 104, 496, 368, 240, 112, 376]
[36m(WorkerDict pid=3319288)[0m WARNING 04-07 21:26:00 [topk_topp_sampler.py:63] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:02 [loader.py:447] Loading weights took 1.51 seconds
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:02 [gpu_model_runner.py:1186] Model loading took 7.1557 GB and 1.687760 seconds
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:02 [gpu_model_runner.py:1456] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:15 [backends.py:415] Using cache directory: /home/huzhe/.cache/vllm/torch_compile_cache/10d149c86c/rank_0_0 for vLLM's torch.compile
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:15 [backends.py:425] Dynamo bytecode transform time: 6.98 s
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:25:59 [config.py:1478] Disabling V1 multiprocessing for external launcher.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:25:59 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
[36m(Runner pid=3309020)[0m wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[36m(WorkerDict pid=3319541)[0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:25:59 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct', speculative_config=None, tokenizer='/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=7548, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"level":3,"custom_ops":["none"],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":512}
[36m(WorkerDict pid=3319288)[0m WARNING 04-07 21:25:59 [utils.py:2321] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:00 [parallel_state.py:954] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:00 [cuda.py:220] Using Flash Attention backend on V1 engine.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:00 [gpu_model_runner.py:1174] Starting to load model /home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct...
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:00 [config.py:3243] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512] is overridden by config [512, 384, 256, 128, 4, 2, 1, 392, 264, 136, 8, 400, 272, 144, 16, 408, 280, 152, 24, 416, 288, 160, 32, 424, 296, 168, 40, 432, 304, 176, 48, 440, 312, 184, 56, 448, 320, 192, 64, 456, 328, 200, 72, 464, 336, 208, 80, 472, 344, 216, 88, 120, 480, 352, 248, 224, 96, 488, 504, 360, 232, 104, 496, 368, 240, 112, 376]
[36m(WorkerDict pid=3319541)[0m WARNING 04-07 21:26:00 [topk_topp_sampler.py:63] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:02 [loader.py:447] Loading weights took 1.47 seconds
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:02 [gpu_model_runner.py:1186] Model loading took 7.1557 GB and 1.651890 seconds
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:02 [gpu_model_runner.py:1456] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:16 [backends.py:115] Directly load the compiled graph for shape None from the cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:22 [monitor.py:33] torch.compile takes 6.98 s in total
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:15 [backends.py:415] Using cache directory: /home/huzhe/.cache/vllm/torch_compile_cache/10d149c86c/rank_1_0 for vLLM's torch.compile
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:15 [backends.py:425] Dynamo bytecode transform time: 7.00 s
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:16 [backends.py:115] Directly load the compiled graph for shape None from the cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:23 [kv_cache_utils.py:566] GPU KV cache size: 978,160 tokens
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:23 [kv_cache_utils.py:569] Maximum concurrency for 7,548 tokens per request: 129.59x
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:45 [gpu_model_runner.py:1534] Graph capturing finished in 22 secs, took 0.55 GiB
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:22 [monitor.py:33] torch.compile takes 7.00 s in total
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:23 [kv_cache_utils.py:566] GPU KV cache size: 978,160 tokens
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:23 [kv_cache_utils.py:569] Maximum concurrency for 7,548 tokens per request: 129.59x
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:45 [core.py:151] init engine (profile, create kv cache, warmup model) took 43.04 seconds
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:45 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:49 [gpu_worker.py:81] Sleep mode freed 40.90 GiB memory, 6.90 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:26:49 [executor_base.py:208] It took 4.453526 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Sampling params: {'max_tokens': 5500, 'detokenize': False, 'n': 5, 'temperature': 1.0, 'top_p': 1.0, 'top_k': -1, 'ignore_eos': False}.
[36m(WorkerDict pid=3319288)[0m After vllm init: 6.90 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m Config
[36m(Runner pid=3309020)[0m algorithm:
[36m(Runner pid=3309020)[0m adv_estimator: grpo
[36m(Runner pid=3309020)[0m disable_kl: false
[36m(Runner pid=3309020)[0m gamma: 1.0
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_horizon: 0.0
[36m(Runner pid=3309020)[0m kl_penalty: low_var_kl
[36m(Runner pid=3309020)[0m kl_target: 0.0
[36m(Runner pid=3309020)[0m kl_type: fixed
[36m(Runner pid=3309020)[0m lam: 1.0
[36m(Runner pid=3309020)[0m use_kl_loss: true
[36m(Runner pid=3309020)[0m data:
[36m(Runner pid=3309020)[0m answer_key: answer
[36m(Runner pid=3309020)[0m image_key: images
[36m(Runner pid=3309020)[0m max_pixels: 4194304
[36m(Runner pid=3309020)[0m max_prompt_length: 2048
[36m(Runner pid=3309020)[0m max_response_length: 5500
[36m(Runner pid=3309020)[0m min_pixels: 262144
[36m(Runner pid=3309020)[0m prompt_key: problem
[36m(Runner pid=3309020)[0m rollout_batch_size: 512
[36m(Runner pid=3309020)[0m seed: 1
[36m(Runner pid=3309020)[0m shuffle: true
[36m(Runner pid=3309020)[0m system_prompt: You are a helpful AI Assistant, designed to provided well-reasoned
[36m(Runner pid=3309020)[0m and detailed responses. You FIRST think about the reasoning process as an internal
[36m(Runner pid=3309020)[0m monologue and then provide the user with the answer. The reasoning process MUST
[36m(Runner pid=3309020)[0m BE enclosed within and tags, and the final answer MUST BE enclosed
[36m(Runner pid=3309020)[0m within and tags.
[36m(Runner pid=3309020)[0m train_files: leonardPKU/GEOQA_8K_R1V@train
[36m(Runner pid=3309020)[0m val_batch_size: -1
[36m(Runner pid=3309020)[0m val_files: leonardPKU/GEOQA_8K_R1V@test
[36m(Runner pid=3309020)[0m trainer:
[36m(Runner pid=3309020)[0m critic_warmup: 0
[36m(Runner pid=3309020)[0m experiment_name: qwen2_5_vl_3b_GEOQA_8K_R1V
[36m(Runner pid=3309020)[0m load_checkpoint_path: null
[36m(Runner pid=3309020)[0m logger:
[36m(Runner pid=3309020)[0m - console
[36m(Runner pid=3309020)[0m wandb: Currently logged in as: ddderek (ddderek-hk-polyu) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
[36m(Runner pid=3309020)[0m wandb: Tracking run with wandb version 0.19.8
[36m(Runner pid=3309020)[0m wandb: Run data is saved locally in /home/huzhe/workspace/EasyR1/wandb/run-20250407_212650-d1xtspm0
[36m(Runner pid=3309020)[0m wandb: Run `wandb offline` to turn off syncing.
[36m(Runner pid=3309020)[0m wandb: Syncing run qwen2_5_vl_3b_GEOQA_8K_R1V
[36m(Runner pid=3309020)[0m wandb: ⭐️ View project at https://wandb.ai/ddderek-hk-polyu/easy_r1
[36m(Runner pid=3309020)[0m wandb: 🚀 View run at https://wandb.ai/ddderek-hk-polyu/easy_r1/runs/d1xtspm0
[36m(Runner pid=3309020)[0m - wandb
[36m(Runner pid=3309020)[0m max_steps: null
[36m(Runner pid=3309020)[0m n_gpus_per_node: 2
[36m(Runner pid=3309020)[0m nnodes: 1
[36m(Runner pid=3309020)[0m project_name: easy_r1
[36m(Runner pid=3309020)[0m save_checkpoint_path: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V
[36m(Runner pid=3309020)[0m save_freq: 5
[36m(Runner pid=3309020)[0m save_limit: 3
[36m(Runner pid=3309020)[0m total_episodes: 10
[36m(Runner pid=3309020)[0m val_before_train: true
[36m(Runner pid=3309020)[0m val_freq: 5
[36m(Runner pid=3309020)[0m val_generations_to_log: 5
[36m(Runner pid=3309020)[0m val_only: false
[36m(Runner pid=3309020)[0m worker:
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m clip_ratio: 0.2
[36m(Runner pid=3309020)[0m disable_kl: false
[36m(Runner pid=3309020)[0m fsdp:
[36m(Runner pid=3309020)[0m enable_cpu_offload: false
[36m(Runner pid=3309020)[0m enable_full_shard: true
[36m(Runner pid=3309020)[0m enable_rank0_init: true
[36m(Runner pid=3309020)[0m fsdp_size: -1
[36m(Runner pid=3309020)[0m mp_buffer_dtype: fp32
[36m(Runner pid=3309020)[0m mp_param_dtype: bf16
[36m(Runner pid=3309020)[0m mp_reduce_dtype: fp32
[36m(Runner pid=3309020)[0m torch_dtype: null
[36m(Runner pid=3309020)[0m use_orig_params: false
[36m(Runner pid=3309020)[0m global_batch_size: 128
[36m(Runner pid=3309020)[0m global_batch_size_per_device: -1
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_penalty: low_var_kl
[36m(Runner pid=3309020)[0m max_grad_norm: 1.0
[36m(Runner pid=3309020)[0m micro_batch_size_per_device_for_experience: 16
[36m(Runner pid=3309020)[0m micro_batch_size_per_device_for_update: 4
[36m(Runner pid=3309020)[0m model:
[36m(Runner pid=3309020)[0m enable_gradient_checkpointing: true
[36m(Runner pid=3309020)[0m freeze_vision_tower: false
[36m(Runner pid=3309020)[0m model_path: /home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct
[36m(Runner pid=3309020)[0m override_config: {}
[36m(Runner pid=3309020)[0m tokenizer_path: /home/huzhe/workspace/model_card/Qwen2.5-VL-3B-Instruct
[36m(Runner pid=3309020)[0m trust_remote_code: false
[36m(Runner pid=3309020)[0m offload:
[36m(Runner pid=3309020)[0m offload_optimizer: true
[36m(Runner pid=3309020)[0m offload_params: true
[36m(Runner pid=3309020)[0m optim:
[36m(Runner pid=3309020)[0m betas:
[36m(Runner pid=3309020)[0m - 0.9
[36m(Runner pid=3309020)[0m - 0.999
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m lr_warmup_ratio: 0.0
[36m(Runner pid=3309020)[0m min_lr_ratio: null
[36m(Runner pid=3309020)[0m strategy: adamw
[36m(Runner pid=3309020)[0m training_steps: 150
[36m(Runner pid=3309020)[0m warmup_style: constant
[36m(Runner pid=3309020)[0m weight_decay: 0.01
[36m(Runner pid=3309020)[0m padding_free: true
[36m(Runner pid=3309020)[0m ppo_epochs: 1
[36m(Runner pid=3309020)[0m strategy: fsdp
[36m(Runner pid=3309020)[0m ulysses_sequence_parallel_size: 1
[36m(Runner pid=3309020)[0m use_kl_loss: true
[36m(Runner pid=3309020)[0m use_torch_compile: true
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m cliprange_value: 0.5
[36m(Runner pid=3309020)[0m fsdp:
[36m(Runner pid=3309020)[0m enable_cpu_offload: false
[36m(Runner pid=3309020)[0m enable_full_shard: true
[36m(Runner pid=3309020)[0m enable_rank0_init: false
[36m(Runner pid=3309020)[0m fsdp_size: -1
[36m(Runner pid=3309020)[0m mp_buffer_dtype: fp32
[36m(Runner pid=3309020)[0m mp_param_dtype: bf16
[36m(Runner pid=3309020)[0m mp_reduce_dtype: fp32
[36m(Runner pid=3309020)[0m torch_dtype: null
[36m(Runner pid=3309020)[0m use_orig_params: false
[36m(Runner pid=3309020)[0m global_batch_size: 256
[36m(Runner pid=3309020)[0m global_batch_size_per_device: -1
[36m(Runner pid=3309020)[0m max_grad_norm: 1.0
[36m(Runner pid=3309020)[0m micro_batch_size_per_device_for_experience: 16
[36m(Runner pid=3309020)[0m micro_batch_size_per_device_for_update: 4
[36m(Runner pid=3309020)[0m model:
[36m(Runner pid=3309020)[0m enable_gradient_checkpointing: true
[36m(Runner pid=3309020)[0m freeze_vision_tower: false
[36m(Runner pid=3309020)[0m model_path: null
[36m(Runner pid=3309020)[0m override_config: {}
[36m(Runner pid=3309020)[0m tokenizer_path: null
[36m(Runner pid=3309020)[0m trust_remote_code: true
[36m(Runner pid=3309020)[0m offload:
[36m(Runner pid=3309020)[0m offload_optimizer: false
[36m(Runner pid=3309020)[0m offload_params: false
[36m(Runner pid=3309020)[0m optim:
[36m(Runner pid=3309020)[0m betas:
[36m(Runner pid=3309020)[0m - 0.9
[36m(Runner pid=3309020)[0m - 0.999
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m lr_warmup_ratio: 0.0
[36m(Runner pid=3309020)[0m min_lr_ratio: null
[36m(Runner pid=3309020)[0m strategy: adamw
[36m(Runner pid=3309020)[0m training_steps: 150
[36m(Runner pid=3309020)[0m warmup_style: constant
[36m(Runner pid=3309020)[0m weight_decay: 0.01
[36m(Runner pid=3309020)[0m padding_free: false
[36m(Runner pid=3309020)[0m ppo_epochs: 1
[36m(Runner pid=3309020)[0m strategy: fsdp
[36m(Runner pid=3309020)[0m ulysses_sequence_parallel_size: 1
[36m(Runner pid=3309020)[0m hybrid_engine: true
[36m(Runner pid=3309020)[0m ref:
[36m(Runner pid=3309020)[0m fsdp:
[36m(Runner pid=3309020)[0m enable_cpu_offload: true
[36m(Runner pid=3309020)[0m enable_full_shard: true
[36m(Runner pid=3309020)[0m enable_rank0_init: true
[36m(Runner pid=3309020)[0m fsdp_size: -1
[36m(Runner pid=3309020)[0m mp_buffer_dtype: fp32
[36m(Runner pid=3309020)[0m mp_param_dtype: bf16
[36m(Runner pid=3309020)[0m mp_reduce_dtype: fp32
[36m(Runner pid=3309020)[0m torch_dtype: null
[36m(Runner pid=3309020)[0m use_orig_params: false
[36m(Runner pid=3309020)[0m micro_batch_size_per_device_for_experience: 16
[36m(Runner pid=3309020)[0m offload:
[36m(Runner pid=3309020)[0m offload_optimizer: false
[36m(Runner pid=3309020)[0m offload_params: false
[36m(Runner pid=3309020)[0m padding_free: true
[36m(Runner pid=3309020)[0m strategy: fsdp
[36m(Runner pid=3309020)[0m ulysses_sequence_parallel_size: 1
[36m(Runner pid=3309020)[0m use_torch_compile: true
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m compute_score: r1v
[36m(Runner pid=3309020)[0m reward_type: function
[36m(Runner pid=3309020)[0m rollout:
[36m(Runner pid=3309020)[0m disable_log_stats: true
[36m(Runner pid=3309020)[0m dtype: bf16
[36m(Runner pid=3309020)[0m enable_chunked_prefill: false
[36m(Runner pid=3309020)[0m enforce_eager: false
[36m(Runner pid=3309020)[0m free_cache_engine: false
[36m(Runner pid=3309020)[0m gpu_memory_utilization: 0.6
[36m(Runner pid=3309020)[0m ignore_eos: false
[36m(Runner pid=3309020)[0m limit_images: 0
[36m(Runner pid=3309020)[0m max_num_batched_tokens: 8192
[36m(Runner pid=3309020)[0m max_num_seqs: 1024
[36m(Runner pid=3309020)[0m n: 5
[36m(Runner pid=3309020)[0m name: vllm
[36m(Runner pid=3309020)[0m prompt_length: 2048
[36m(Runner pid=3309020)[0m response_length: 5500
[36m(Runner pid=3309020)[0m temperature: 1.0
[36m(Runner pid=3309020)[0m tensor_parallel_size: 1
[36m(Runner pid=3309020)[0m top_k: -1
[36m(Runner pid=3309020)[0m top_p: 1.0
[36m(Runner pid=3309020)[0m val_override_config:
[36m(Runner pid=3309020)[0m n: 1
[36m(Runner pid=3309020)[0m temperature: 0.5
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:08<54:03, 8.63s/it, est. speed input: 54.13 toks/s, output: 1.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:10<30:09, 4.83s/it, est. speed input: 94.87 toks/s, output: 1.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:13<23:36, 3.79s/it, est. speed input: 111.97 toks/s, output: 2.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:13<10:28, 1.69s/it, est. speed input: 193.90 toks/s, output: 3.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:15<07:55, 1.29s/it, est. speed input: 235.12 toks/s, output: 8.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 9/377 [00:15<05:30, 1.11it/s, est. speed input: 286.61 toks/s, output: 18.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:15<03:39, 1.66it/s, est. speed input: 343.11 toks/s, output: 29.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 14/377 [00:15<02:10, 2.78it/s, est. speed input: 427.54 toks/s, output: 46.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 18/377 [00:16<01:15, 4.73it/s, est. speed input: 538.61 toks/s, output: 68.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 20/377 [00:16<01:02, 5.70it/s, est. speed input: 589.16 toks/s, output: 80.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 24/377 [00:16<00:41, 8.57it/s, est. speed input: 698.86 toks/s, output: 105.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 27/377 [00:16<00:34, 10.28it/s, est. speed input: 775.12 toks/s, output: 123.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 31/377 [00:16<00:26, 13.18it/s, est. speed input: 876.64 toks/s, output: 149.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 34/377 [00:16<00:22, 15.05it/s, est. speed input: 955.00 toks/s, output: 169.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 41/377 [00:17<00:17, 19.49it/s, est. speed input: 1135.34 toks/s, output: 216.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 44/377 [00:17<00:21, 15.85it/s, est. speed input: 1195.24 toks/s, output: 234.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 50/377 [00:17<00:15, 20.72it/s, est. speed input: 1348.18 toks/s, output: 280.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 53/377 [00:17<00:16, 19.59it/s, est. speed input: 1414.88 toks/s, output: 302.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 56/377 [00:17<00:15, 20.62it/s, est. speed input: 1482.43 toks/s, output: 325.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 63/377 [00:17<00:12, 26.03it/s, est. speed input: 1647.95 toks/s, output: 381.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 66/377 [00:18<00:14, 22.12it/s, est. speed input: 1704.77 toks/s, output: 403.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▊ | 70/377 [00:18<00:13, 23.30it/s, est. speed input: 1793.71 toks/s, output: 434.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 73/377 [00:18<00:12, 23.74it/s, est. speed input: 1857.46 toks/s, output: 459.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 77/377 [00:18<00:11, 26.16it/s, est. speed input: 1943.19 toks/s, output: 493.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 80/377 [00:18<00:11, 26.00it/s, est. speed input: 2005.36 toks/s, output: 519.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 83/377 [00:18<00:11, 25.92it/s, est. speed input: 2068.13 toks/s, output: 545.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▎ | 89/377 [00:18<00:09, 30.80it/s, est. speed input: 2202.15 toks/s, output: 600.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 93/377 [00:19<00:08, 31.94it/s, est. speed input: 2286.01 toks/s, output: 636.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 97/377 [00:19<00:11, 24.45it/s, est. speed input: 2351.02 toks/s, output: 668.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 102/377 [00:19<00:10, 27.31it/s, est. speed input: 2451.50 toks/s, output: 716.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 105/377 [00:19<00:10, 27.10it/s, est. speed input: 2507.27 toks/s, output: 744.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 109/377 [00:19<00:09, 29.23it/s, est. speed input: 2588.01 toks/s, output: 783.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 113/377 [00:19<00:11, 23.36it/s, est. speed input: 2648.64 toks/s, output: 817.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 116/377 [00:20<00:11, 22.90it/s, est. speed input: 2699.14 toks/s, output: 845.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 122/377 [00:20<00:09, 27.43it/s, est. speed input: 2818.51 toks/s, output: 907.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 126/377 [00:20<00:09, 27.52it/s, est. speed input: 2889.86 toks/s, output: 948.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 130/377 [00:20<00:08, 27.63it/s, est. speed input: 2963.56 toks/s, output: 989.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 135/377 [00:20<00:08, 29.81it/s, est. speed input: 3056.16 toks/s, output: 1041.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 139/377 [00:20<00:07, 30.86it/s, est. speed input: 3128.55 toks/s, output: 1085.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 148/377 [00:20<00:05, 43.12it/s, est. speed input: 3310.23 toks/s, output: 1191.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 153/377 [00:21<00:06, 37.22it/s, est. speed input: 3393.18 toks/s, output: 1243.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 158/377 [00:21<00:05, 38.87it/s, est. speed input: 3487.82 toks/s, output: 1301.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:21<00:06, 35.00it/s, est. speed input: 3572.86 toks/s, output: 1356.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 168/377 [00:21<00:06, 34.12it/s, est. speed input: 3655.16 toks/s, output: 1413.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 175/377 [00:21<00:04, 41.46it/s, est. speed input: 3788.99 toks/s, output: 1501.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 181/377 [00:21<00:04, 45.15it/s, est. speed input: 3901.97 toks/s, output: 1575.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 186/377 [00:21<00:04, 39.39it/s, est. speed input: 3979.00 toks/s, output: 1631.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 191/377 [00:22<00:04, 39.67it/s, est. speed input: 4063.74 toks/s, output: 1693.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 196/377 [00:22<00:05, 33.93it/s, est. speed input: 4135.30 toks/s, output: 1749.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 201/377 [00:22<00:04, 35.84it/s, est. speed input: 4217.54 toks/s, output: 1814.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 206/377 [00:22<00:04, 37.38it/s, est. speed input: 4302.41 toks/s, output: 1879.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 211/377 [00:22<00:04, 38.69it/s, est. speed input: 4384.30 toks/s, output: 1944.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 218/377 [00:22<00:03, 44.35it/s, est. speed input: 4510.75 toks/s, output: 2040.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 225/377 [00:22<00:03, 48.59it/s, est. speed input: 4630.04 toks/s, output: 2138.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 230/377 [00:22<00:03, 44.87it/s, est. speed input: 4709.67 toks/s, output: 2203.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 235/377 [00:23<00:03, 42.77it/s, est. speed input: 4788.88 toks/s, output: 2270.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 240/377 [00:23<00:03, 41.33it/s, est. speed input: 4863.17 toks/s, output: 2337.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:23<00:03, 39.53it/s, est. speed input: 4950.18 toks/s, output: 2417.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 251/377 [00:23<00:04, 30.14it/s, est. speed input: 4993.58 toks/s, output: 2474.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 255/377 [00:23<00:04, 29.57it/s, est. speed input: 5043.12 toks/s, output: 2527.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 261/377 [00:23<00:03, 32.15it/s, est. speed input: 5127.13 toks/s, output: 2614.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 267/377 [00:24<00:03, 32.52it/s, est. speed input: 5209.10 toks/s, output: 2699.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 274/377 [00:24<00:02, 36.93it/s, est. speed input: 5316.83 toks/s, output: 2809.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▎ | 278/377 [00:24<00:03, 32.98it/s, est. speed input: 5357.37 toks/s, output: 2864.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 282/377 [00:24<00:02, 32.84it/s, est. speed input: 5407.17 toks/s, output: 2923.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 286/377 [00:24<00:02, 32.78it/s, est. speed input: 5458.47 toks/s, output: 2985.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 290/377 [00:24<00:03, 28.65it/s, est. speed input: 5493.98 toks/s, output: 3040.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 293/377 [00:25<00:03, 23.69it/s, est. speed input: 5506.48 toks/s, output: 3074.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 297/377 [00:25<00:03, 24.62it/s, est. speed input: 5550.38 toks/s, output: 3137.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 302/377 [00:25<00:02, 28.06it/s, est. speed input: 5614.24 toks/s, output: 3221.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 306/377 [00:25<00:02, 30.46it/s, est. speed input: 5664.92 toks/s, output: 3291.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 310/377 [00:25<00:02, 22.64it/s, est. speed input: 5676.53 toks/s, output: 3337.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:25<00:02, 24.28it/s, est. speed input: 5720.90 toks/s, output: 3406.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 317/377 [00:26<00:03, 18.47it/s, est. speed input: 5712.26 toks/s, output: 3434.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 320/377 [00:26<00:03, 18.37it/s, est. speed input: 5729.97 toks/s, output: 3480.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 323/377 [00:26<00:04, 13.33it/s, est. speed input: 5699.42 toks/s, output: 3498.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 326/377 [00:26<00:03, 15.51it/s, est. speed input: 5732.76 toks/s, output: 3556.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 329/377 [00:27<00:03, 15.01it/s, est. speed input: 5739.90 toks/s, output: 3601.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 333/377 [00:27<00:02, 17.46it/s, est. speed input: 5776.36 toks/s, output: 3678.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 336/377 [00:27<00:03, 13.34it/s, est. speed input: 5750.57 toks/s, output: 3705.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 338/377 [00:27<00:03, 9.91it/s, est. speed input: 5702.66 toks/s, output: 3705.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 340/377 [00:28<00:03, 10.70it/s, est. speed input: 5709.29 toks/s, output: 3741.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 343/377 [00:28<00:02, 12.08it/s, est. speed input: 5722.78 toks/s, output: 3800.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:28<00:02, 10.93it/s, est. speed input: 5709.05 toks/s, output: 3823.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 347/377 [00:29<00:04, 6.97it/s, est. speed input: 5626.16 toks/s, output: 3805.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 348/377 [00:39<00:48, 1.68s/it, est. speed input: 4181.62 toks/s, output: 2867.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 349/377 [00:40<00:43, 1.57s/it, est. speed input: 4083.74 toks/s, output: 2841.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:46<01:05, 2.44s/it, est. speed input: 3579.77 toks/s, output: 2537.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 351/377 [00:46<00:50, 1.95s/it, est. speed input: 3567.60 toks/s, output: 2577.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 352/377 [00:47<00:41, 1.67s/it, est. speed input: 3517.99 toks/s, output: 2589.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▎| 353/377 [00:48<00:36, 1.53s/it, est. speed input: 3444.98 toks/s, output: 2585.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:49<00:34, 1.50s/it, est. speed input: 3357.44 toks/s, output: 2569.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 355/377 [00:51<00:31, 1.42s/it, est. speed input: 3286.97 toks/s, output: 2567.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 356/377 [00:52<00:28, 1.36s/it, est. speed input: 3219.51 toks/s, output: 2567.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 357/377 [00:52<00:20, 1.02s/it, est. speed input: 3217.15 toks/s, output: 2617.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:54<00:23, 1.25s/it, est. speed input: 3118.52 toks/s, output: 2591.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 360/377 [00:56<00:20, 1.19s/it, est. speed input: 3011.67 toks/s, output: 2609.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:56<00:15, 1.04it/s, est. speed input: 3005.42 toks/s, output: 2659.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:57<00:13, 1.09it/s, est. speed input: 2972.13 toks/s, output: 2686.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 363/377 [00:59<00:16, 1.16s/it, est. speed input: 2889.01 toks/s, output: 2668.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [01:00<00:16, 1.28s/it, est. speed input: 2822.11 toks/s, output: 2665.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 365/377 [01:01<00:13, 1.08s/it, est. speed input: 2802.06 toks/s, output: 2705.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [01:02<00:10, 1.06it/s, est. speed input: 2783.74 toks/s, output: 2745.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [01:02<00:08, 1.15it/s, est. speed input: 2760.04 toks/s, output: 2781.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [01:03<00:08, 1.10it/s, est. speed input: 2724.46 toks/s, output: 2805.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [01:07<00:14, 1.76s/it, est. speed input: 2579.17 toks/s, output: 2718.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [01:14<00:22, 3.24s/it, est. speed input: 2351.81 toks/s, output: 2545.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:14<00:00, 5.07it/s, est. speed input: 2394.99 toks/s, output: 3059.76 toks/s]
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:45 [gpu_model_runner.py:1534] Graph capturing finished in 22 secs, took 0.55 GiB
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:45 [core.py:151] init engine (profile, create kv cache, warmup model) took 43.26 seconds
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:45 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.45 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:50 [gpu_worker.py:81] Sleep mode freed 40.90 GiB memory, 6.90 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:26:50 [executor_base.py:208] It took 4.452175 seconds to fall asleep.
[36m(WorkerDict pid=3319541)[0m Sampling params: {'max_tokens': 5500, 'detokenize': False, 'n': 5, 'temperature': 1.0, 'top_p': 1.0, 'top_k': -1, 'ignore_eos': False}.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 18.91 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:27:28 [executor_base.py:219] It took 0.337972 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.36 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m NCCL version 2.21.5+cuda12.4
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 49.80 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:29:30 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:27:28 [executor_base.py:219] It took 0.337642 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:29:31 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 8.88 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:29:31 [executor_base.py:208] It took 0.324036 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 8.88 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:29:32 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:29:33 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 8.90 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:29:33 [executor_base.py:208] It took 0.328974 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since D is the midpoint of AB and BD = 2.0, AD = 2.0 as well. The line l is a vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC. To minimize this perimeter, we need to consider the geometric properties and the constraints given.\n\nSince D is the midpoint of AB, AE + EC will be minimized when E lies on the line perpendicular to AB passing through D, which is the altitude from D to AB. This is because the shortest distance between two points is a straight line, and in this case, the line segment AE + EC will be minimized when E is directly below D on the line l.\n\nThe minimum perimeter of triangle AEC is then the sum of the lengths of AE and EC, which is equal to the length of AD plus the length of DC. Since AD = 2.0 and DC = 2.0 (as D is the midpoint of AB and BD = 2.0), the minimum perimeter of triangle AEC is 2.0 + 2.0 = 4.0.\n\nTherefore, the minimum perimeter of triangle AEC is 4.0.\n\n\n4.0\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["38.0\n\nGiven that triangle ABC is rotated clockwise around point A by 90 degrees to form triangle AB'C', we know that angle BAC = 90 degrees. This rotation implies that triangle ABC is congruent to triangle AB'C' and that angle B'AC' = 90 degrees. Since angle CC'B' = 32 degrees, we can deduce that angle AC'B' is the remaining angle in the right triangle AC'B', which is 90 degrees - 32 degrees = 58 degrees. However, we need to find the size of angle AC'B' in the context of the original triangle ABC, which is the same as angle AC'B'. Since angle AC'B' is part of the original triangle and the rotation does not change the internal angles, angle AC'B' remains unchanged. Therefore, angle AC'B' = 58 degrees. However, the question asks for the size of angle AC'B' in the context of the rotated triangle AB'C', which is the same as the original triangle ABC. Therefore, the size of angle AC'B' is 58 degrees. However, the answer provided is 38 degrees, which suggests that there might be a misunderstanding or a different interpretation of the problem. Given the options provided, the closest match to the calculated 58 degrees is 38 degrees, which might be a result of a different interpretation or a different set of given values. However, based on the standard geometric properties and the given information, the correct answer should be 58 degrees, not 38 degrees."]
[36m(Runner pid=3309020)[0m [score] [0.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['72°\n\n\n1. Since A, B, and D are on circle O, and ABD is a triangle inscribed in circle O, angle ADB is a right angle (90°) because the angle subtended by a diameter in a semicircle is a right angle.\n\n2. Given that angle ADC = 54.0°, we can find angle ADB by subtracting this from 90°, as the sum of angles in a triangle is 180°. Therefore, angle ADB = 90° - 54° = 36°.\n\n3. Since A, B, and D are on circle O, angle AEB is an inscribed angle that subtends the same arc as angle ADB. The measure of an inscribed angle is half the measure of the arc it subtends.\n\n4. Therefore, angle AEB = 1/2 * angle ADB = 1/2 * 36° = 18°.\n\n5. However, this is not the final answer because we need to consider the entire circle. The diameter BE is a straight line, so angle AEB is actually the angle subtended by the arc BE at point A, which is half the measure of the arc BE.\n\n6. Since BE is the diameter, the arc BE is 180°. Therefore, angle AEB = 1/2 * 180° = 90°.\n\n7. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n8. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n9. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n10. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n11. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n12. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n13. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n14. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n15. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n16. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n17. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n18. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n19. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n20. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n21. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n22. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n23. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n24. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n25. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n26. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n27. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n28. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n29. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n30. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n31. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n32. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n33. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n34. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n35. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n36. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n37. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n38. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n39. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n40. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n41. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n42. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n43. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n44. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n45. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n46. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n47. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n48. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n49. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n50. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n51. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n52. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n53. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n54. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n55. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n56. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n57. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n58. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n59. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n60. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n61. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n62. Since angle AEB is an inscribed angle that subtends the same arc as angle ADB, we need to subtract this from 180°, as the sum of angles in a triangle is 180°. Therefore, angle AEB = 180° - 90° = 90°.\n\n63. But we need to consider the entire circle, so the angle AEB is actually the supplementary angle to the angle subtended by the arc BE at point A, which is 180° - 90° = 90°.\n\n64. Since angle AEB'][36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:31<2:13:01, 6.26s/it, est. speed input: 72.20 toks/s, output: 23.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:32<58:25, 2.76s/it, est. speed input: 142.00 toks/s, output: 41.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:37<41:07, 1.95s/it, est. speed input: 183.92 toks/s, output: 53.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:38<25:22, 1.21s/it, est. speed input: 243.33 toks/s, output: 73.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:39<17:48, 1.17it/s, est. speed input: 296.42 toks/s, output: 91.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:39<12:08, 1.71it/s, est. speed input: 351.62 toks/s, output: 109.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:40<09:36, 2.16it/s, est. speed input: 394.70 toks/s, output: 130.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:41<07:01, 2.94it/s, est. speed input: 448.22 toks/s, output: 154.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:41<05:11, 3.96it/s, est. speed input: 501.72 toks/s, output: 177.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:41<04:03, 5.06it/s, est. speed input: 549.06 toks/s, output: 200.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:42<03:47, 5.39it/s, est. speed input: 596.27 toks/s, output: 216.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:42<02:48, 7.26it/s, est. speed input: 646.84 toks/s, output: 231.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:42<01:37, 12.44it/s, est. speed input: 754.71 toks/s, output: 272.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:43<01:29, 13.44it/s, est. speed input: 802.15 toks/s, output: 291.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:43<01:27, 13.64it/s, est. speed input: 848.22 toks/s, output: 315.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:43<01:30, 13.14it/s, est. speed input: 895.83 toks/s, output: 330.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:44<01:19, 15.05it/s, est. speed input: 943.61 toks/s, output: 353.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:44<02:00, 9.82it/s, est. speed input: 976.46 toks/s, output: 371.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:45<01:16, 15.36it/s, est. speed input: 1072.95 toks/s, output: 408.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:45<01:20, 14.53it/s, est. speed input: 1114.95 toks/s, output: 433.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:46<01:45, 11.09it/s, est. speed input: 1146.23 toks/s, output: 449.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:46<00:56, 20.31it/s, est. speed input: 1284.17 toks/s, output: 524.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:46<00:42, 27.14it/s, est. speed input: 1386.67 toks/s, output: 565.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:46<00:36, 30.65it/s, est. speed input: 1471.17 toks/s, output: 604.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:47<00:31, 35.78it/s, est. speed input: 1562.74 toks/s, output: 654.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:47<00:31, 35.16it/s, est. speed input: 1650.68 toks/s, output: 696.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:47<00:23, 46.07it/s, est. speed input: 1792.59 toks/s, output: 764.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:47<00:25, 42.17it/s, est. speed input: 1877.83 toks/s, output: 813.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:48<00:38, 27.68it/s, est. speed input: 1945.26 toks/s, output: 851.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:48<00:49, 21.68it/s, est. speed input: 1971.61 toks/s, output: 862.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:49<01:01, 17.44it/s, est. speed input: 1996.44 toks/s, output: 882.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:49<00:52, 20.22it/s, est. speed input: 2050.20 toks/s, output: 906.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:49<00:52, 19.99it/s, est. speed input: 2087.89 toks/s, output: 929.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:50<01:17, 13.59it/s, est. speed input: 2105.33 toks/s, output: 939.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:50<00:42, 24.60it/s, est. speed input: 2235.25 toks/s, output: 1002.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:50<00:34, 29.67it/s, est. speed input: 2313.22 toks/s, output: 1048.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:51<00:29, 33.79it/s, est. speed input: 2510.81 toks/s, output: 1159.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:51<00:28, 34.32it/s, est. speed input: 2548.85 toks/s, output: 1182.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:51<00:23, 41.98it/s, est. speed input: 2668.72 toks/s, output: 1267.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:52<00:26, 36.43it/s, est. speed input: 2777.96 toks/s, output: 1336.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:52<00:29, 32.39it/s, est. speed input: 2837.01 toks/s, output: 1383.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:53<00:29, 32.17it/s, est. speed input: 2869.11 toks/s, output: 1404.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:53<00:32, 29.38it/s, est. speed input: 2898.01 toks/s, output: 1431.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:53<00:28, 32.60it/s, est. speed input: 2970.92 toks/s, output: 1479.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:53<00:19, 47.10it/s, est. speed input: 3097.04 toks/s, output: 1551.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:53<00:17, 50.41it/s, est. speed input: 3207.43 toks/s, output: 1620.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:54<00:12, 68.57it/s, est. speed input: 3375.60 toks/s, output: 1722.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:54<00:10, 84.92it/s, est. speed input: 3538.72 toks/s, output: 1833.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:54<00:08, 105.41it/s, est. speed input: 3703.54 toks/s, output: 1943.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:54<00:08, 94.53it/s, est. speed input: 3819.30 toks/s, output: 2031.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:54<00:08, 93.27it/s, est. speed input: 3933.98 toks/s, output: 2102.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:54<00:10, 79.34it/s, est. speed input: 4043.25 toks/s, output: 2176.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:55<00:08, 91.09it/s, est. speed input: 4194.11 toks/s, output: 2282.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:55<00:07, 106.16it/s, est. speed input: 4352.06 toks/s, output: 2411.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:55<00:06, 109.79it/s, est. speed input: 4467.48 toks/s, output: 2491.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:55<00:06, 106.47it/s, est. speed input: 4584.34 toks/s, output: 2584.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:55<00:06, 111.40it/s, est. speed input: 4700.43 toks/s, output: 2678.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:55<00:08, 87.40it/s, est. speed input: 4805.30 toks/s, output: 2753.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:56<00:06, 98.82it/s, est. speed input: 4994.33 toks/s, output: 2901.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:56<00:06, 95.25it/s, est. speed input: 5107.69 toks/s, output: 2978.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:56<00:08, 72.49it/s, est. speed input: 5228.77 toks/s, output: 3070.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:56<00:08, 74.70it/s, est. speed input: 5301.13 toks/s, output: 3116.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:56<00:08, 77.21it/s, est. speed input: 5368.11 toks/s, output: 3181.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:57<00:07, 82.67it/s, est. speed input: 5478.74 toks/s, output: 3275.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:57<00:07, 84.11it/s, est. speed input: 5546.43 toks/s, output: 3339.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:57<00:04, 118.43it/s, est. speed input: 5737.01 toks/s, output: 3472.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:57<00:05, 94.16it/s, est. speed input: 5833.97 toks/s, output: 3557.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:57<00:04, 115.25it/s, est. speed input: 5992.21 toks/s, output: 3716.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:57<00:04, 118.25it/s, est. speed input: 6099.22 toks/s, output: 3811.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:57<00:05, 94.64it/s, est. speed input: 6225.52 toks/s, output: 3912.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:58<00:04, 101.58it/s, est. speed input: 6333.12 toks/s, output: 4019.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:58<00:03, 120.24it/s, est. speed input: 6477.55 toks/s, output: 4141.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:58<00:02, 149.71it/s, est. speed input: 6696.41 toks/s, output: 4323.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:58<00:02, 140.77it/s, est. speed input: 6916.68 toks/s, output: 4517.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:58<00:02, 147.65it/s, est. speed input: 7064.02 toks/s, output: 4671.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:58<00:02, 155.70it/s, est. speed input: 7244.75 toks/s, output: 4831.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:58<00:01, 172.21it/s, est. speed input: 7431.26 toks/s, output: 5001.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:59<00:01, 174.41it/s, est. speed input: 7569.17 toks/s, output: 5112.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:59<00:01, 168.15it/s, est. speed input: 7739.56 toks/s, output: 5290.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:59<00:01, 161.43it/s, est. speed input: 7914.49 toks/s, output: 5454.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:59<00:01, 139.21it/s, est. speed input: 8045.85 toks/s, output: 5578.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:59<00:01, 127.40it/s, est. speed input: 8172.47 toks/s, output: 5707.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [01:00<00:02, 96.56it/s, est. speed input: 8252.12 toks/s, output: 5791.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [01:00<00:01, 125.40it/s, est. speed input: 8462.41 toks/s, output: 6027.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [01:00<00:01, 126.34it/s, est. speed input: 8565.68 toks/s, output: 6119.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [01:00<00:01, 128.55it/s, est. speed input: 8665.86 toks/s, output: 6218.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [01:00<00:00, 151.41it/s, est. speed input: 8866.85 toks/s, output: 6455.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [01:00<00:00, 176.31it/s, est. speed input: 9083.91 toks/s, output: 6692.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [01:00<00:00, 122.97it/s, est. speed input: 9187.01 toks/s, output: 6787.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [01:01<00:00, 114.42it/s, est. speed input: 9278.74 toks/s, output: 6884.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [01:01<00:00, 93.68it/s, est. speed input: 9370.46 toks/s, output: 6977.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [01:01<00:00, 75.90it/s, est. speed input: 9434.94 toks/s, output: 7063.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [01:01<00:00, 69.03it/s, est. speed input: 9478.31 toks/s, output: 7116.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:02<00:00, 30.18it/s, est. speed input: 9404.41 toks/s, output: 7097.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:07<00:00, 19.00it/s, est. speed input: 8815.63 toks/s, output: 6666.37 toks/s]
[36m(Runner pid=3309020)[0m [score] [0.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["65.0\n\nTo solve this problem, we need to use some fundamental properties of circles and triangles. Here's the step-by-step reasoning:\n\n1. **Understanding the Problem:**\n - AB is the diameter of circle O, so angle AOB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n - CD is a tangent to the circle at point D, which means that angle ODC is a right angle (90 degrees) because a tangent at any point on a circle is perpendicular to the radius at that point.\n - We are given angle A = 35.0 degrees, and we need to find angle C.\n\n2. **Using the Properties of the Triangle:**\n - Since AB is the diameter, angle AOB is 90 degrees.\n - Since CD is tangent to the circle at D, angle ODC is 90 degrees.\n - Angle AOB is composed of angle AOD and angle BOD, and since angle AOB is 90 degrees, angle AOD must be the remaining part of the circle, which is 90 degrees.\n\n3. **Finding the Remaining Angles:**\n - Since angle AOD is 90 degrees and angle A is 35 degrees, angle AOD can be split into two angles: angle AOD - angle A = 90 degrees - 35 degrees = 55 degrees.\n - Therefore, angle BOD is also 55 degrees because the angles on a straight line add up to 180 degrees, and angle AOB is 90 degrees, so angle BOD is 90 degrees - 35 degrees = 55 degrees.\n\n4. **Finding Angle C:**\n - Angle C is the angle at point C, which is the angle between the tangent CD and the line CB.\n - Since angle ODC is 90 degrees and angle CDB is the same as angle BOD (because they are both angles formed by the tangent and the radius at the point of tangency), angle CDB is 55 degrees.\n - Therefore, angle C is the remaining angle in triangle CDB, which is 180 degrees - 90 degrees - 55 degrees = 35 degrees.\n\nSo, angle C is 35 degrees."]
[36m(Runner pid=3309020)[0m [score] [0.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['10.0\n\n\nGiven that AD is the diameter of circle O, angle ABD is a right angle (90 degrees) because any angle in a semicircle is a right angle. Since EA is a tangent to circle O at point A, angle EAD is also a right angle (90 degrees) because a tangent to a circle is perpendicular to the radius at the point of tangency. \n\nGiven angle EAC = 120 degrees, we can find angle BAC by subtracting angle EAD from angle EAC:\n\nangle BAC = angle EAC - angle EAD\nangle BAC = 120 degrees - 90 degrees\nangle BAC = 30 degrees\n\nSince AD is the diameter, angle ABD is 90 degrees, and angle BAC is 30 degrees, angle ABC can be found by subtracting angle BAC from angle ABD:\n\nangle ABC = angle ABD - angle BAC\nangle ABC = 90 degrees - 30 degrees\nangle ABC = 60 degrees\n\nHowever, the question asks for the degree of angle ABC, and the answer is given as 10 degrees. This suggests that there might be a misunderstanding or a different interpretation of the problem. Given the standard geometric properties and the provided answer, it seems there might be an error in the problem statement or the provided answer. If we consider the standard geometric properties and the given answer, the correct interpretation would be that angle ABC is 10 degrees, not 60 degrees. This discrepancy indicates that the provided answer might be incorrect or there might be a need to re-evaluate the problem statement.']
[36m(Runner pid=3309020)[0m [score] [0.05000000074505806]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Step 0
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.094
[36m(Runner pid=3309020)[0m format_reward: 0.082
[36m(Runner pid=3309020)[0m overall_reward: 0.109
[36m(Runner pid=3309020)[0m reward_score: 0.109
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.186
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m Training Episode 0.
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 1; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.47 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 18.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:30:17 [executor_base.py:219] It took 0.338060 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.89 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.39 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.42 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:30:17 [executor_base.py:219] It took 0.340697 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:31:57 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:31:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.50 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:31:58 [executor_base.py:208] It took 0.324141 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.50 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:31:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:31:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.50 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:31:58 [executor_base.py:208] It took 0.331483 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.28520599007606506, 'actor/pg_clipfrac': 0.0006317119114100933, 'actor/ppo_kl': -0.002688041655346751}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.6292210817337036, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.4239043891429901, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.04610633850097656, 'actor/pg_clipfrac': 0.0005970149068161845, 'actor/ppo_kl': 0.000180281451321207}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4506100118160248, 'actor/pg_clipfrac': 0.0014245014172047377, 'actor/ppo_kl': -0.000826333009172231}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4762011766433716, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.14247627556324005, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.9623295068740845, 'actor/pg_clipfrac': 0.0015143866185098886, 'actor/ppo_kl': -0.00042637475416995585}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.48097091913223267, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.18099363148212433, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3373603820800781, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.18029706180095673, 'actor/pg_clipfrac': 0.002439024392515421, 'actor/ppo_kl': 0.0001739626022754237}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.029286982491612434, 'actor/pg_clipfrac': 0.0005665722419507802, 'actor/ppo_kl': -0.0006555151776410639}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.38397544622421265, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005368234706111252}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.18231940269470215, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.1769775152206421, 'actor/pg_clipfrac': 0.0018939394503831863, 'actor/ppo_kl': -0.000724545621778816}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4181039333343506, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1021774411201477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.11459914594888687, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5654119253158569, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.5865752696990967, 'actor/pg_clipfrac': 0.0014836795162409544, 'actor/ppo_kl': -0.0004546366399154067}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.08099807053804398, 'actor/pg_clipfrac': 0.005621135700494051, 'actor/ppo_kl': -0.0005838287761434913}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.13652192056179047, 'actor/pg_clipfrac': 0.0014316391898319125, 'actor/ppo_kl': 0.0011853222968056798}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.39741820096969604, 'actor/pg_clipfrac': 0.0012626262614503503, 'actor/ppo_kl': 0.00017793972801882774}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.09107475727796555, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.15313516557216644, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 7.4500262599030975e-06, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.07483331859111786, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.248025581240654, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015961288008838892}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.22949303686618805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037531278212554753}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5536091923713684, 'actor/pg_clipfrac': 0.0014958863612264395, 'actor/ppo_kl': 0.0006447237683460116}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.13453714549541473, 'actor/pg_clipfrac': 0.0005497526144608855, 'actor/ppo_kl': 0.0003753125201910734}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.34002986550331116, 'actor/pg_clipfrac': 0.0046439627185463905, 'actor/ppo_kl': -0.0024319433141499758}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3737601935863495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.031150925904512405, 'actor/pg_clipfrac': 0.0006169031257741153, 'actor/ppo_kl': -0.0007497680489905179}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.33213362097740173, 'actor/pg_clipfrac': 0.000554323720280081, 'actor/ppo_kl': -0.0005418445216491818}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.02543182298541069, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3075239658355713, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.360609769821167, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.25728291273117065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3208279013633728, 'actor/pg_clipfrac': 0.0006675567710772157, 'actor/ppo_kl': 0.0011594552779570222}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.6940205693244934, 'actor/pg_clipfrac': 0.0014825797406956553, 'actor/ppo_kl': 0.0005931840278208256}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5224328637123108, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:27<1:55:14, 5.42s/it, est. speed input: 81.14 toks/s, output: 20.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<49:33, 2.34s/it, est. speed input: 160.86 toks/s, output: 43.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:40<49:59, 2.37s/it, est. speed input: 169.72 toks/s, output: 51.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:40<30:47, 1.47s/it, est. speed input: 227.50 toks/s, output: 78.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:41<20:41, 1.01it/s, est. speed input: 279.40 toks/s, output: 97.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:41<14:28, 1.44it/s, est. speed input: 331.96 toks/s, output: 119.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:42<10:19, 2.01it/s, est. speed input: 380.28 toks/s, output: 134.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:43<08:18, 2.49it/s, est. speed input: 425.39 toks/s, output: 152.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:43<06:10, 3.34it/s, est. speed input: 476.64 toks/s, output: 176.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:43<04:31, 4.52it/s, est. speed input: 523.59 toks/s, output: 191.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:43<03:18, 6.16it/s, est. speed input: 575.42 toks/s, output: 211.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:44<02:49, 7.18it/s, est. speed input: 622.60 toks/s, output: 234.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:45<03:41, 5.50it/s, est. speed input: 650.82 toks/s, output: 252.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:46<02:48, 7.17it/s, est. speed input: 697.44 toks/s, output: 276.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:46<02:22, 8.48it/s, est. speed input: 746.66 toks/s, output: 298.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:47<02:09, 9.20it/s, est. speed input: 834.91 toks/s, output: 342.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:47<01:48, 11.01it/s, est. speed input: 880.72 toks/s, output: 361.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:47<01:14, 15.81it/s, est. speed input: 973.13 toks/s, output: 399.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:48<01:04, 18.04it/s, est. speed input: 1062.01 toks/s, output: 443.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:48<00:51, 22.43it/s, est. speed input: 1151.06 toks/s, output: 484.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:48<00:54, 21.08it/s, est. speed input: 1230.75 toks/s, output: 528.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:49<00:35, 32.27it/s, est. speed input: 1372.04 toks/s, output: 599.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:49<00:47, 23.90it/s, est. speed input: 1445.96 toks/s, output: 645.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:50<00:47, 23.54it/s, est. speed input: 1482.35 toks/s, output: 666.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:50<00:53, 20.87it/s, est. speed input: 1553.35 toks/s, output: 712.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:50<00:43, 25.38it/s, est. speed input: 1641.47 toks/s, output: 764.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:51<00:36, 29.71it/s, est. speed input: 1723.71 toks/s, output: 806.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:51<00:32, 33.68it/s, est. speed input: 1809.44 toks/s, output: 856.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:51<00:30, 35.58it/s, est. speed input: 1852.80 toks/s, output: 876.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:51<00:30, 34.41it/s, est. speed input: 1937.56 toks/s, output: 931.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:51<00:39, 26.82it/s, est. speed input: 1966.61 toks/s, output: 956.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:52<00:30, 34.17it/s, est. speed input: 2050.07 toks/s, output: 1009.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:52<00:21, 48.31it/s, est. speed input: 2217.54 toks/s, output: 1091.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:52<00:22, 45.05it/s, est. speed input: 2333.22 toks/s, output: 1168.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:52<00:15, 65.15it/s, est. speed input: 2547.82 toks/s, output: 1309.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:53<00:14, 67.10it/s, est. speed input: 2628.90 toks/s, output: 1345.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:53<00:12, 78.97it/s, est. speed input: 2786.42 toks/s, output: 1450.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:53<00:16, 57.88it/s, est. speed input: 2867.83 toks/s, output: 1499.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:53<00:13, 68.58it/s, est. speed input: 2990.77 toks/s, output: 1575.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:53<00:13, 70.51it/s, est. speed input: 3064.41 toks/s, output: 1627.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:54<00:18, 50.44it/s, est. speed input: 3130.07 toks/s, output: 1678.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:54<00:17, 53.00it/s, est. speed input: 3204.07 toks/s, output: 1733.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:54<00:18, 49.48it/s, est. speed input: 3276.86 toks/s, output: 1781.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:54<00:20, 43.66it/s, est. speed input: 3349.60 toks/s, output: 1844.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:55<00:16, 51.51it/s, est. speed input: 3429.43 toks/s, output: 1898.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:55<00:14, 60.74it/s, est. speed input: 3542.57 toks/s, output: 1978.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:55<00:12, 67.43it/s, est. speed input: 3617.42 toks/s, output: 2028.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:55<00:09, 83.83it/s, est. speed input: 3736.33 toks/s, output: 2122.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:55<00:08, 91.38it/s, est. speed input: 3849.17 toks/s, output: 2177.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:55<00:08, 91.20it/s, est. speed input: 3969.58 toks/s, output: 2246.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:55<00:08, 91.85it/s, est. speed input: 4083.88 toks/s, output: 2342.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:56<00:07, 109.04it/s, est. speed input: 4237.46 toks/s, output: 2472.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:56<00:06, 111.73it/s, est. speed input: 4353.90 toks/s, output: 2574.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:56<00:06, 114.26it/s, est. speed input: 4469.71 toks/s, output: 2666.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:56<00:05, 127.03it/s, est. speed input: 4624.87 toks/s, output: 2789.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:56<00:05, 125.91it/s, est. speed input: 4734.00 toks/s, output: 2865.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:56<00:09, 74.45it/s, est. speed input: 4818.87 toks/s, output: 2940.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:57<00:08, 79.15it/s, est. speed input: 4925.21 toks/s, output: 3002.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:57<00:08, 75.82it/s, est. speed input: 4995.74 toks/s, output: 3061.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:57<00:08, 76.74it/s, est. speed input: 5067.22 toks/s, output: 3115.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:57<00:07, 87.41it/s, est. speed input: 5179.97 toks/s, output: 3211.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:57<00:08, 78.27it/s, est. speed input: 5242.11 toks/s, output: 3271.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:57<00:05, 105.28it/s, est. speed input: 5425.49 toks/s, output: 3431.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:57<00:05, 106.12it/s, est. speed input: 5529.48 toks/s, output: 3509.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:58<00:05, 107.44it/s, est. speed input: 5637.59 toks/s, output: 3601.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:58<00:04, 114.33it/s, est. speed input: 5819.83 toks/s, output: 3747.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:58<00:04, 111.21it/s, est. speed input: 5922.97 toks/s, output: 3827.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:58<00:03, 133.62it/s, est. speed input: 6133.19 toks/s, output: 3999.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:58<00:03, 126.17it/s, est. speed input: 6238.92 toks/s, output: 4087.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:58<00:04, 100.84it/s, est. speed input: 6330.99 toks/s, output: 4171.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:59<00:04, 96.01it/s, est. speed input: 6429.73 toks/s, output: 4266.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:59<00:04, 92.60it/s, est. speed input: 6528.80 toks/s, output: 4347.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:59<00:04, 100.24it/s, est. speed input: 6636.00 toks/s, output: 4452.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:59<00:03, 118.22it/s, est. speed input: 6808.95 toks/s, output: 4615.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:59<00:03, 117.20it/s, est. speed input: 6911.82 toks/s, output: 4700.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:59<00:03, 113.86it/s, est. speed input: 7013.14 toks/s, output: 4786.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [01:00<00:04, 84.12it/s, est. speed input: 7097.01 toks/s, output: 4873.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [01:00<00:03, 105.07it/s, est. speed input: 7233.08 toks/s, output: 5026.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [01:00<00:02, 108.93it/s, est. speed input: 7368.90 toks/s, output: 5144.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [01:00<00:03, 96.25it/s, est. speed input: 7455.47 toks/s, output: 5237.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [01:00<00:02, 99.64it/s, est. speed input: 7551.81 toks/s, output: 5351.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [01:00<00:02, 128.60it/s, est. speed input: 7720.97 toks/s, output: 5496.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [01:00<00:01, 125.40it/s, est. speed input: 7820.45 toks/s, output: 5593.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [01:01<00:01, 153.24it/s, est. speed input: 8028.88 toks/s, output: 5793.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [01:01<00:01, 114.09it/s, est. speed input: 8141.06 toks/s, output: 5934.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [01:01<00:01, 114.23it/s, est. speed input: 8240.50 toks/s, output: 6023.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [01:01<00:01, 91.74it/s, est. speed input: 8319.99 toks/s, output: 6103.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [01:01<00:01, 111.54it/s, est. speed input: 8500.05 toks/s, output: 6281.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [01:02<00:01, 119.56it/s, est. speed input: 8635.21 toks/s, output: 6435.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [01:02<00:00, 133.70it/s, est. speed input: 8762.07 toks/s, output: 6598.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [01:02<00:00, 120.95it/s, est. speed input: 8855.57 toks/s, output: 6695.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [01:02<00:00, 115.80it/s, est. speed input: 8949.35 toks/s, output: 6781.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [01:02<00:00, 110.65it/s, est. speed input: 9037.37 toks/s, output: 6862.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [01:03<00:00, 56.99it/s, est. speed input: 9064.64 toks/s, output: 6932.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [01:03<00:00, 45.14it/s, est. speed input: 9081.89 toks/s, output: 6990.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [01:03<00:00, 43.83it/s, est. speed input: 9132.47 toks/s, output: 7049.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [01:04<00:00, 31.93it/s, est. speed input: 9121.54 toks/s, output: 7066.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [01:05<00:00, 20.01it/s, est. speed input: 9046.91 toks/s, output: 7025.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:06<00:00, 16.82it/s, est. speed input: 9009.62 toks/s, output: 7041.16 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:06<00:00, 19.37it/s, est. speed input: 9009.62 toks/s, output: 7041.16 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1470581293106079, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5248335599899292, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2544544041156769, 'actor/pg_clipfrac': 0.0018433179939165711, 'actor/ppo_kl': -7.45245924917981e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.07004434615373611, 'actor/pg_clipfrac': 0.0039840638637542725, 'actor/ppo_kl': -0.0002466660225763917}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -1.5186715126037598, 'actor/pg_clipfrac': 0.0023419202771037817, 'actor/ppo_kl': 0.0004675717791542411}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -1.05414879322052, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.455810546875, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.14131422340869904, 'actor/pg_clipfrac': 0.0005865102866664529, 'actor/ppo_kl': -0.0005295493174344301}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5553301572799683, 'actor/pg_clipfrac': 0.001855287584476173, 'actor/ppo_kl': 0.0012539797462522984}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5986718535423279, 'actor/pg_clipfrac': 0.001818181830458343, 'actor/ppo_kl': 0.0002922312414739281}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.411940336227417, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.0317796915769577, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.6201398968696594, 'actor/pg_clipfrac': 0.0004952946910634637, 'actor/ppo_kl': 9.529941598884761e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.002198018366470933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.14720194041728973, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.012032576836645603, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2287016361951828, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.6919295787811279, 'actor/pg_clipfrac': 0.0020020019728690386, 'actor/ppo_kl': -0.0003794669173657894}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.7692444324493408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005210464005358517}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.13515114784240723, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2922608256340027, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(Runner pid=3309020)[0m Step 1
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.65
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.001
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.038
[36m(Runner pid=3309020)[0m ppo_kl: -6.451884128697572e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.047
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.047
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.182
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.182
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1045178
[36m(Runner pid=3309020)[0m balanced_min: 1045178
[36m(Runner pid=3309020)[0m max: 1047420
[36m(Runner pid=3309020)[0m mean: 1045178.0
[36m(Runner pid=3309020)[0m min: 1042936
[36m(Runner pid=3309020)[0m minmax_diff: 4484
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 99.342
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 35.551
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 60.178
[36m(Runner pid=3309020)[0m mfu_actor: 0.128
[36m(Runner pid=3309020)[0m throughput: 1162.472
[36m(Runner pid=3309020)[0m time_per_step: 899.1
[36m(Runner pid=3309020)[0m total_num_tokens: 2090356
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 647.0
[36m(Runner pid=3309020)[0m mean: 463.537
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1720.0
[36m(Runner pid=3309020)[0m mean: 353.008
[36m(Runner pid=3309020)[0m min: 5.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.146
[36m(Runner pid=3309020)[0m format: 0.144
[36m(Runner pid=3309020)[0m overall: 0.182
[36m(Runner pid=3309020)[0m tag_reward: 0.331
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 5.03781293852091e-05
[36m(Runner pid=3309020)[0m gen: 0.131
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.049
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.276
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.105
[36m(Runner pid=3309020)[0m gen: 118.739
[36m(Runner pid=3309020)[0m old: 91.778
[36m(Runner pid=3309020)[0m ref: 103.093
[36m(Runner pid=3309020)[0m reward: 7.443
[36m(Runner pid=3309020)[0m step: 899.1
[36m(Runner pid=3309020)[0m update_actor: 577.495
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 2; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:45:16 [executor_base.py:219] It took 0.339930 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:46:55 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:45:16 [executor_base.py:219] It took 0.340209 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:46:55 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 21:46:55 [executor_base.py:208] It took 0.325657 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.84 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:47:25 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:47:25 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 21:47:25 [executor_base.py:208] It took 0.327264 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.267455130815506, 'actor/pg_clipfrac': 0.0012143290368840098, 'actor/ppo_kl': 0.00011696783622028306}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1431652456521988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.23090381920337677, 'actor/pg_clipfrac': 0.0012353304773569107, 'actor/ppo_kl': -0.0006318920059129596}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4873155355453491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.6156367659568787, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009108046069741249}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5273602604866028, 'actor/pg_clipfrac': 0.0004317789280321449, 'actor/ppo_kl': -0.0005967250326648355}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.25937530398368835, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.15744154155254364, 'actor/pg_clipfrac': 0.0006896551931276917, 'actor/ppo_kl': 0.00021648933761753142}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.05513248220086098, 'actor/pg_clipfrac': 0.0012224939418956637, 'actor/ppo_kl': -0.0007210681214928627}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.7294143438339233, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.01841464452445507, 'actor/pg_clipfrac': 0.0010136847849935293, 'actor/ppo_kl': 5.710590266971849e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5021381974220276, 'actor/pg_clipfrac': 0.001095290295779705, 'actor/ppo_kl': -0.001649079960770905}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1508011817932129, 'actor/pg_clipfrac': 0.0010706637986004353, 'actor/ppo_kl': -0.00045364469406194985}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.5737313628196716, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000212627332075499}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2587641477584839, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.39115411043167114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3155258893966675, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006891454104334116}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3730555772781372, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015070420922711492}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5090009570121765, 'actor/pg_clipfrac': 0.0013404826167970896, 'actor/ppo_kl': 0.00043364567682147026}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.4090682566165924, 'actor/pg_clipfrac': 0.0046260603703558445, 'actor/ppo_kl': 0.0011024901177734137}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.01004754938185215, 'actor/pg_clipfrac': 0.0017513134516775608, 'actor/ppo_kl': -0.000585415749810636}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.6619983315467834, 'actor/pg_clipfrac': 0.0017652250826358795, 'actor/ppo_kl': 0.00011089893814641982}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.30058154463768005, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006828453624621034}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4931051433086395, 'actor/pg_clipfrac': 0.0007616146467626095, 'actor/ppo_kl': 0.00024037822731770575}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.34412768483161926, 'actor/pg_clipfrac': 0.0006451613153330982, 'actor/ppo_kl': -0.0006601899513043463}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.6307560205459595, 'actor/pg_clipfrac': 0.0009328357991762459, 'actor/ppo_kl': -0.00011344098311383277}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.8213815093040466, 'actor/pg_clipfrac': 0.0027137042488902807, 'actor/ppo_kl': -0.0016190549358725548}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.49893900752067566, 'actor/pg_clipfrac': 0.000861326465383172, 'actor/ppo_kl': 0.0016509400447830558}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0015124513301998377, 'actor/pg_clipfrac': 0.000598444021306932, 'actor/ppo_kl': -0.0008436256903223693}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.24493512511253357, 'actor/pg_clipfrac': 0.0024429967161267996, 'actor/ppo_kl': 0.00015468394849449396}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5906585454940796, 'actor/pg_clipfrac': 0.002270147670060396, 'actor/ppo_kl': -0.0010539654176682234}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.9698663949966431, 'actor/pg_clipfrac': 0.0012809564359486103, 'actor/ppo_kl': -0.0001640120317460969}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.8448680639266968, 'actor/pg_clipfrac': 0.0012771391775459051, 'actor/ppo_kl': -0.00145157880615443}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2876925468444824, 'actor/pg_clipfrac': 0.0015432098880410194, 'actor/ppo_kl': 0.0019851909019052982}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.15525475144386292, 'actor/pg_clipfrac': 0.0019543974194675684, 'actor/ppo_kl': 0.0013637654483318329}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.22786486148834229, 'actor/pg_clipfrac': 0.0026362037751823664, 'actor/ppo_kl': -0.0007123477989807725}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.10249242931604385, 'actor/pg_clipfrac': 0.0013513513840734959, 'actor/ppo_kl': -0.0006272360915318131}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4216265082359314, 'actor/pg_clipfrac': 0.0016393442638218403, 'actor/ppo_kl': 0.00025764998281374574}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.05799450725317001, 'actor/pg_clipfrac': 0.002020202111452818, 'actor/ppo_kl': 0.0013185559073463082}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3177967667579651, 'actor/pg_clipfrac': 0.0022539442870765924, 'actor/ppo_kl': 0.001614409382455051}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.486838698387146, 'actor/pg_clipfrac': 0.0027777778450399637, 'actor/ppo_kl': 0.001908383797854185}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.09699483960866928, 'actor/pg_clipfrac': 0.002399520017206669, 'actor/ppo_kl': 0.0012487830827012658}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.11301299184560776, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002704879269003868}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.01437130756676197, 'actor/pg_clipfrac': 0.002518891589716077, 'actor/ppo_kl': 0.000497946108225733}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.08645129203796387, 'actor/pg_clipfrac': 0.001166180707514286, 'actor/ppo_kl': 0.0005718186730518937}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.8257572054862976, 'actor/pg_clipfrac': 0.0006353239878080785, 'actor/ppo_kl': -0.0003662963572423905}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.4265509843826294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005500681581906974}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5191723108291626, 'actor/pg_clipfrac': 0.0015847861068323255, 'actor/ppo_kl': -1.3113651675666915e-06}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1502711921930313, 'actor/pg_clipfrac': 0.0021303792018443346, 'actor/ppo_kl': 0.0016310764476656914}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.4719638526439667, 'actor/pg_clipfrac': 0.007276507094502449, 'actor/ppo_kl': 0.003204189008101821}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.064131960272789, 'actor/pg_clipfrac': 0.0017391304718330503, 'actor/ppo_kl': 0.0008821288356557488}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.24103423953056335, 'actor/pg_clipfrac': 0.010554090142250061, 'actor/ppo_kl': 0.0024387440644204617}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7918423414230347, 'actor/pg_clipfrac': 0.002752293599769473, 'actor/ppo_kl': -0.0004119873046875}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.43230709433555603, 'actor/pg_clipfrac': 0.0030895983800292015, 'actor/ppo_kl': -0.0021059701684862375}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1875099539756775, 'actor/pg_clipfrac': 0.0045740422792732716, 'actor/ppo_kl': 0.002197982044890523}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.4207003712654114, 'actor/pg_clipfrac': 0.007334963418543339, 'actor/ppo_kl': -0.0031050913967192173}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.6972929239273071, 'actor/pg_clipfrac': 0.005808325484395027, 'actor/ppo_kl': -0.0011933207279071212}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.38066497445106506, 'actor/pg_clipfrac': 0.004703668877482414, 'actor/ppo_kl': 0.0024194582365453243}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.13357695937156677, 'actor/pg_clipfrac': 0.001988071482628584, 'actor/ppo_kl': -0.001116722240112722}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.07746872305870056, 'actor/pg_clipfrac': 0.0019083969527855515, 'actor/ppo_kl': 0.00048610364319756627}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.24869033694267273, 'actor/pg_clipfrac': 0.0034387896303087473, 'actor/ppo_kl': 0.0012510003289207816}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.30456703901290894, 'actor/pg_clipfrac': 0.0030120480805635452, 'actor/ppo_kl': 0.0011058077216148376}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.37756362557411194, 'actor/pg_clipfrac': 0.0023282887414097786, 'actor/ppo_kl': 0.00214961520396173}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.7167063355445862, 'actor/pg_clipfrac': 0.0033030554186552763, 'actor/ppo_kl': 0.00023109001631382853}
[36m(Runner pid=3309020)[0m Step 2
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.49
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.002
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.002
[36m(Runner pid=3309020)[0m pg_loss: 0.02
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.032
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.032
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.185
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.185
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1064654
[36m(Runner pid=3309020)[0m balanced_min: 1060905
[36m(Runner pid=3309020)[0m max: 1065156
[36m(Runner pid=3309020)[0m mean: 1062779.5
[36m(Runner pid=3309020)[0m min: 1060403
[36m(Runner pid=3309020)[0m minmax_diff: 4753
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 104.639
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 38.341
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 66.559
[36m(Runner pid=3309020)[0m mfu_actor: 0.129
[36m(Runner pid=3309020)[0m throughput: 1164.759
[36m(Runner pid=3309020)[0m time_per_step: 912.446
[36m(Runner pid=3309020)[0m total_num_tokens: 2125559
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 466.475
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 5500.0
[36m(Runner pid=3309020)[0m mean: 363.822
[36m(Runner pid=3309020)[0m min: 5.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.139
[36m(Runner pid=3309020)[0m format: 0.155
[36m(Runner pid=3309020)[0m overall: 0.185
[36m(Runner pid=3309020)[0m tag_reward: 0.342
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.663926951102016e-05
[36m(Runner pid=3309020)[0m gen: 0.154
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.27
[36m(Runner pid=3309020)[0m timing_s:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:47:47, 5.07s/it, est. speed input: 89.11 toks/s, output: 23.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<52:23, 2.48s/it, est. speed input: 162.85 toks/s, output: 46.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:32<35:55, 1.70s/it, est. speed input: 216.12 toks/s, output: 65.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:35<26:38, 1.27s/it, est. speed input: 264.30 toks/s, output: 82.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:37<19:43, 1.06it/s, est. speed input: 312.44 toks/s, output: 99.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:38<13:47, 1.51it/s, est. speed input: 367.75 toks/s, output: 120.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:38<05:55, 3.47it/s, est. speed input: 543.95 toks/s, output: 185.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:39<05:57, 3.44it/s, est. speed input: 579.31 toks/s, output: 195.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:40<04:48, 4.24it/s, est. speed input: 632.61 toks/s, output: 217.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:41<04:35, 4.44it/s, est. speed input: 673.13 toks/s, output: 237.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:41<03:39, 5.53it/s, est. speed input: 722.94 toks/s, output: 259.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:42<03:11, 6.31it/s, est. speed input: 765.94 toks/s, output: 284.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:42<02:26, 8.21it/s, est. speed input: 814.79 toks/s, output: 302.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:42<02:27, 8.13it/s, est. speed input: 858.97 toks/s, output: 318.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:42<01:53, 10.51it/s, est. speed input: 911.18 toks/s, output: 340.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:43<02:02, 9.70it/s, est. speed input: 950.51 toks/s, output: 362.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:44<01:58, 9.96it/s, est. speed input: 989.99 toks/s, output: 378.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:44<01:32, 12.74it/s, est. speed input: 1037.84 toks/s, output: 402.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:44<00:48, 23.79it/s, est. speed input: 1189.13 toks/s, output: 487.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:44<00:54, 21.37it/s, est. speed input: 1235.12 toks/s, output: 508.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:45<00:49, 23.26it/s, est. speed input: 1327.61 toks/s, output: 554.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:45<01:05, 17.60it/s, est. speed input: 1361.30 toks/s, output: 579.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:45<01:02, 18.30it/s, est. speed input: 1405.96 toks/s, output: 604.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:46<01:03, 17.90it/s, est. speed input: 1442.94 toks/s, output: 625.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:46<01:10, 15.95it/s, est. speed input: 1479.88 toks/s, output: 653.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:46<00:45, 24.47it/s, est. speed input: 1572.49 toks/s, output: 704.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:47<00:45, 24.58it/s, est. speed input: 1658.23 toks/s, output: 748.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:47<00:42, 26.09it/s, est. speed input: 1740.53 toks/s, output: 802.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:47<00:35, 30.48it/s, est. speed input: 1824.33 toks/s, output: 856.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:47<00:27, 38.96it/s, est. speed input: 1912.74 toks/s, output: 904.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:48<00:31, 34.30it/s, est. speed input: 1989.73 toks/s, output: 943.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:48<00:37, 28.35it/s, est. speed input: 2021.53 toks/s, output: 967.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:48<00:34, 31.04it/s, est. speed input: 2067.74 toks/s, output: 990.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:48<00:36, 29.14it/s, est. speed input: 2106.58 toks/s, output: 1008.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:48<00:18, 56.23it/s, est. speed input: 2290.27 toks/s, output: 1112.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:48<00:16, 64.04it/s, est. speed input: 2377.65 toks/s, output: 1180.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:49<00:24, 41.28it/s, est. speed input: 2447.87 toks/s, output: 1214.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:49<00:21, 46.54it/s, est. speed input: 2535.40 toks/s, output: 1272.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:49<00:22, 44.70it/s, est. speed input: 2615.87 toks/s, output: 1328.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:50<00:22, 43.45it/s, est. speed input: 2694.81 toks/s, output: 1372.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:50<00:26, 36.43it/s, est. speed input: 2764.24 toks/s, output: 1422.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:50<00:22, 42.25it/s, est. speed input: 2882.17 toks/s, output: 1484.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:50<00:24, 39.23it/s, est. speed input: 2919.47 toks/s, output: 1507.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:51<00:24, 38.95it/s, est. speed input: 2998.31 toks/s, output: 1547.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:51<00:38, 24.39it/s, est. speed input: 3007.40 toks/s, output: 1565.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:51<00:36, 25.36it/s, est. speed input: 3042.31 toks/s, output: 1585.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:51<00:24, 38.17it/s, est. speed input: 3162.46 toks/s, output: 1667.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:52<00:25, 36.34it/s, est. speed input: 3200.45 toks/s, output: 1691.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:52<00:27, 33.02it/s, est. speed input: 3233.98 toks/s, output: 1711.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:52<00:25, 35.73it/s, est. speed input: 3310.33 toks/s, output: 1767.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:52<00:20, 44.46it/s, est. speed input: 3387.79 toks/s, output: 1818.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:52<00:19, 46.16it/s, est. speed input: 3464.29 toks/s, output: 1871.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:53<00:19, 45.34it/s, est. speed input: 3537.06 toks/s, output: 1913.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:53<00:21, 40.75it/s, est. speed input: 3567.80 toks/s, output: 1927.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:53<00:14, 59.70it/s, est. speed input: 3685.25 toks/s, output: 2002.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:53<00:10, 77.31it/s, est. speed input: 3808.21 toks/s, output: 2110.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:53<00:12, 67.25it/s, est. speed input: 3881.34 toks/s, output: 2156.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:53<00:11, 73.89it/s, est. speed input: 3958.42 toks/s, output: 2211.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:53<00:10, 74.67it/s, est. speed input: 4037.81 toks/s, output: 2275.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:54<00:08, 91.04it/s, est. speed input: 4198.97 toks/s, output: 2374.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:54<00:08, 87.16it/s, est. speed input: 4274.43 toks/s, output: 2431.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:54<00:10, 70.55it/s, est. speed input: 4345.67 toks/s, output: 2497.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:54<00:10, 74.19it/s, est. speed input: 4459.27 toks/s, output: 2571.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:54<00:09, 76.16it/s, est. speed input: 4534.85 toks/s, output: 2622.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:54<00:08, 88.55it/s, est. speed input: 4651.30 toks/s, output: 2709.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:55<00:07, 97.67it/s, est. speed input: 4808.36 toks/s, output: 2817.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:55<00:07, 98.23it/s, est. speed input: 4924.91 toks/s, output: 2895.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:55<00:05, 120.17it/s, est. speed input: 5090.94 toks/s, output: 3014.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:55<00:05, 114.09it/s, est. speed input: 5208.64 toks/s, output: 3116.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:55<00:04, 127.60it/s, est. speed input: 5363.69 toks/s, output: 3257.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:55<00:05, 109.06it/s, est. speed input: 5474.48 toks/s, output: 3332.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:55<00:04, 124.13it/s, est. speed input: 5628.03 toks/s, output: 3443.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:56<00:03, 148.56it/s, est. speed input: 5817.60 toks/s, output: 3592.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:56<00:04, 136.47it/s, est. speed input: 5964.73 toks/s, output: 3694.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:56<00:04, 115.18it/s, est. speed input: 6069.09 toks/s, output: 3798.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:56<00:04, 116.65it/s, est. speed input: 6175.77 toks/s, output: 3876.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:56<00:06, 80.64it/s, est. speed input: 6262.01 toks/s, output: 3963.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:56<00:05, 92.50it/s, est. speed input: 6366.96 toks/s, output: 4063.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:57<00:04, 109.42it/s, est. speed input: 6513.85 toks/s, output: 4184.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:57<00:03, 115.21it/s, est. speed input: 6720.04 toks/s, output: 4346.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:57<00:04, 93.08it/s, est. speed input: 6812.48 toks/s, output: 4417.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:57<00:04, 94.96it/s, est. speed input: 6911.67 toks/s, output: 4518.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:57<00:03, 108.51it/s, est. speed input: 7060.93 toks/s, output: 4644.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:57<00:03, 114.01it/s, est. speed input: 7168.82 toks/s, output: 4730.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:58<00:03, 119.99it/s, est. speed input: 7281.01 toks/s, output: 4840.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:58<00:03, 105.58it/s, est. speed input: 7381.62 toks/s, output: 4927.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:58<00:03, 108.84it/s, est. speed input: 7488.61 toks/s, output: 5009.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:58<00:03, 102.35it/s, est. speed input: 7585.83 toks/s, output: 5122.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:58<00:02, 106.86it/s, est. speed input: 7708.35 toks/s, output: 5220.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:58<00:03, 90.07it/s, est. speed input: 7792.95 toks/s, output: 5316.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:59<00:01, 130.53it/s, est. speed input: 8059.21 toks/s, output: 5579.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:59<00:01, 141.46it/s, est. speed input: 8197.60 toks/s, output: 5720.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:59<00:01, 130.01it/s, est. speed input: 8325.28 toks/s, output: 5854.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:59<00:01, 135.77it/s, est. speed input: 8456.76 toks/s, output: 6000.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:59<00:01, 111.23it/s, est. speed input: 8550.07 toks/s, output: 6092.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:59<00:01, 115.66it/s, est. speed input: 8654.16 toks/s, output: 6207.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:59<00:01, 108.48it/s, est. speed input: 8748.61 toks/s, output: 6321.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [01:00<00:01, 123.34it/s, est. speed input: 8899.07 toks/s, output: 6456.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [01:00<00:01, 114.50it/s, est. speed input: 8991.65 toks/s, output: 6569.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [01:00<00:00, 113.65it/s, est. speed input: 9083.90 toks/s, output: 6659.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [01:00<00:01, 75.14it/s, est. speed input: 9135.33 toks/s, output: 6718.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [01:00<00:01, 74.72it/s, est. speed input: 9192.64 toks/s, output: 6810.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [01:01<00:01, 58.48it/s, est. speed input: 9275.05 toks/s, output: 6908.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [01:01<00:00, 55.03it/s, est. speed input: 9363.06 toks/s, output: 6971.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [01:01<00:00, 54.73it/s, est. speed input: 9413.51 toks/s, output: 7024.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [01:01<00:00, 66.70it/s, est. speed input: 9533.23 toks/s, output: 7163.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [01:02<00:00, 34.19it/s, est. speed input: 9497.99 toks/s, output: 7164.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:06<00:00, 8.06it/s, est. speed input: 8967.62 toks/s, output: 6807.59 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:06<00:00, 19.12it/s, est. speed input: 8967.62 toks/s, output: 6807.59 toks/s]
[36m(Runner pid=3309020)[0m adv: 0.163
[36m(Runner pid=3309020)[0m gen: 143.35
[36m(Runner pid=3309020)[0m old: 93.367
[36m(Runner pid=3309020)[0m ref: 93.79
[36m(Runner pid=3309020)[0m reward: 7.256
[36m(Runner pid=3309020)[0m step: 912.446
[36m(Runner pid=3309020)[0m update_actor: 573.223
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 3; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.04 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:00:33 [executor_base.py:219] It took 0.341343 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.96 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.65 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.78 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:02:11 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:00:33 [executor_base.py:219] It took 0.340851 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:02:12 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.86 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:02:12 [executor_base.py:208] It took 0.325849 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.86 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:02:14 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:02:14 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:02:14 [executor_base.py:208] It took 0.330176 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.44260290265083313, 'actor/pg_clipfrac': 0.0015600624028593302, 'actor/ppo_kl': 0.001072556246072054}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.10825995355844498, 'actor/pg_clipfrac': 0.0017910447204485536, 'actor/ppo_kl': -0.00022379121219273657}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.5061581134796143, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005413625622168183}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.46028441190719604, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2835262417793274, 'actor/pg_clipfrac': 0.0011229646624997258, 'actor/ppo_kl': 0.001380419242195785}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3342105448246002, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0980956181883812, 'actor/pg_clipfrac': 0.0004752851673401892, 'actor/ppo_kl': 0.0007174214697442949}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2677910029888153, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2618477940559387, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3849707245826721, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.17878788709640503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.046590656042099, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.6027724146842957, 'actor/pg_clipfrac': 0.0011383038945496082, 'actor/ppo_kl': -0.001303959870710969}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.05729365348815918, 'actor/pg_clipfrac': 0.002504173666238785, 'actor/ppo_kl': 5.793372474727221e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5259340405464172, 'actor/pg_clipfrac': 0.0004868549294769764, 'actor/ppo_kl': 0.0006146584055386484}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.6887446045875549, 'actor/pg_clipfrac': 0.0009751340840011835, 'actor/ppo_kl': -0.0011033565970137715}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.4941164553165436, 'actor/pg_clipfrac': 0.00712105818092823, 'actor/ppo_kl': 0.0024725506082177162}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.563158392906189, 'actor/pg_clipfrac': 0.005151320248842239, 'actor/ppo_kl': -0.0020070276223123074}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3348384201526642, 'actor/pg_clipfrac': 0.004233700223267078, 'actor/ppo_kl': -0.0028875868301838636}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.05533921346068382, 'actor/pg_clipfrac': 0.006818181835114956, 'actor/ppo_kl': -0.00017408891289960593}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.004141626879572868, 'actor/pg_clipfrac': 0.0018099547596648335, 'actor/ppo_kl': -0.0006741182878613472}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.49869394302368164, 'actor/pg_clipfrac': 0.004672897048294544, 'actor/ppo_kl': 0.0037235489580780268}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.7473983764648438, 'actor/pg_clipfrac': 0.002945508109405637, 'actor/ppo_kl': 0.003746434347704053}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.45113226771354675, 'actor/pg_clipfrac': 0.0016251354245468974, 'actor/ppo_kl': -0.0001535828923806548}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.016702968627214432, 'actor/pg_clipfrac': 0.0030075188260525465, 'actor/ppo_kl': -0.002679371740669012}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.14327189326286316, 'actor/pg_clipfrac': 0.004413062706589699, 'actor/ppo_kl': 0.00010448499233461916}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.20839686691761017, 'actor/pg_clipfrac': 0.006051436997950077, 'actor/ppo_kl': 0.002917398000136018}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.41379010677337646, 'actor/pg_clipfrac': 0.0033191086258739233, 'actor/ppo_kl': 0.002544260583817959}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1022975966334343, 'actor/pg_clipfrac': 0.0043196543119847775, 'actor/ppo_kl': -0.0013993121683597565}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.6211839318275452, 'actor/pg_clipfrac': 0.0025662959087640047, 'actor/ppo_kl': 0.0020749978721141815}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.17007243633270264, 'actor/pg_clipfrac': 0.005067567341029644, 'actor/ppo_kl': -0.002011863049119711}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.6477664709091187, 'actor/pg_clipfrac': 0.0038797284942120314, 'actor/ppo_kl': 0.001433611148968339}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5088275074958801, 'actor/pg_clipfrac': 0.008686210960149765, 'actor/ppo_kl': 0.004165755119174719}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.40815797448158264, 'actor/pg_clipfrac': 0.0032916392665356398, 'actor/ppo_kl': 0.00247209588997066}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6919206976890564, 'actor/pg_clipfrac': 0.0023497818037867546, 'actor/ppo_kl': 0.00030149234225973487}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.16972209513187408, 'actor/pg_clipfrac': 0.0006915629492141306, 'actor/ppo_kl': 0.0008240801398642361}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5322192907333374, 'actor/pg_clipfrac': 0.00592216569930315, 'actor/ppo_kl': 0.0038543352857232094}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.15981392562389374, 'actor/pg_clipfrac': 0.0068292682990431786, 'actor/ppo_kl': 0.0029072086326777935}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3670665919780731, 'actor/pg_clipfrac': 0.006933744065463543, 'actor/ppo_kl': 0.003592538181692362}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.8929972052574158, 'actor/pg_clipfrac': 0.006139677483588457, 'actor/ppo_kl': 0.002485834527760744}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.621065616607666, 'actor/pg_clipfrac': 0.005946481600403786, 'actor/ppo_kl': 0.0021248809061944485}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.01638084650039673, 'actor/pg_clipfrac': 0.004994450602680445, 'actor/ppo_kl': -0.000818432483356446}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3273051679134369, 'actor/pg_clipfrac': 0.010086455382406712, 'actor/ppo_kl': 0.00417046993970871}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.22821196913719177, 'actor/pg_clipfrac': 0.0004810004902537912, 'actor/ppo_kl': 0.0013756286352872849}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.526353120803833, 'actor/pg_clipfrac': 0.0069551775231957436, 'actor/ppo_kl': 0.006093022413551807}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.20577336847782135, 'actor/pg_clipfrac': 0.006456241011619568, 'actor/ppo_kl': 0.0033476375974714756}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.32422420382499695, 'actor/pg_clipfrac': 0.00533617939800024, 'actor/ppo_kl': 0.006448511499911547}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5650103688240051, 'actor/pg_clipfrac': 0.007646559271961451, 'actor/ppo_kl': 0.002759072929620743}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4715001881122589, 'actor/pg_clipfrac': 0.002230151556432247, 'actor/ppo_kl': 0.000716063950676471}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.16605432331562042, 'actor/pg_clipfrac': 0.002689979737624526, 'actor/ppo_kl': 0.0008814233588054776}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3493930697441101, 'actor/pg_clipfrac': 0.008559200912714005, 'actor/ppo_kl': -0.004266823176294565}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1058506965637207, 'actor/pg_clipfrac': 0.006097560748457909, 'actor/ppo_kl': 0.0006232966552488506}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.21996060013771057, 'actor/pg_clipfrac': 0.005559968296438456, 'actor/ppo_kl': 0.003429365810006857}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.25457659363746643, 'actor/pg_clipfrac': 0.005242464132606983, 'actor/ppo_kl': 0.004258032888174057}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.5390040278434753, 'actor/pg_clipfrac': 0.003944773226976395, 'actor/ppo_kl': 0.004309237468987703}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.693461000919342, 'actor/pg_clipfrac': 0.004135079216212034, 'actor/ppo_kl': 0.0038238284178078175}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.031439151614904404, 'actor/pg_clipfrac': 0.007223113905638456, 'actor/ppo_kl': 0.0018071240046992898}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.25682416558265686, 'actor/pg_clipfrac': 0.006139677483588457, 'actor/ppo_kl': 0.004592090379446745}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3557034432888031, 'actor/pg_clipfrac': 0.0016483516665175557, 'actor/ppo_kl': 0.002347746631130576}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.47077319025993347, 'actor/pg_clipfrac': 0.00920245423913002, 'actor/ppo_kl': 0.016270093619823456}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2835633158683777, 'actor/pg_clipfrac': 0.010351967066526413, 'actor/ppo_kl': 0.00899518933147192}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.515514075756073, 'actor/pg_clipfrac': 0.006163328420370817, 'actor/ppo_kl': 0.0036990572698414326}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6416219472885132, 'actor/pg_clipfrac': 0.005353318993002176, 'actor/ppo_kl': 0.0026263853069394827}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.023404289036989212, 'actor/pg_clipfrac': 0.01676829345524311, 'actor/ppo_kl': 0.01202253345400095}
[36m(Runner pid=3309020)[0m Step 3
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.512
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.012
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.004
[36m(Runner pid=3309020)[0m pg_loss: 0.055
[36m(Runner pid=3309020)[0m ppo_kl: 0.001
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.071
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.071
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.297
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.297
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1054499
[36m(Runner pid=3309020)[0m balanced_min: 1054499
[36m(Runner pid=3309020)[0m max: 1055927
[36m(Runner pid=3309020)[0m mean: 1054499.0
[36m(Runner pid=3309020)[0m min: 1053071
[36m(Runner pid=3309020)[0m minmax_diff: 2856
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:23<1:40:43, 4.74s/it, est. speed input: 94.09 toks/s, output: 21.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:31<59:47, 2.82s/it, est. speed input: 146.19 toks/s, output: 35.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:33<36:52, 1.75s/it, est. speed input: 202.77 toks/s, output: 54.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:33<22:58, 1.09s/it, est. speed input: 270.42 toks/s, output: 77.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:36<18:23, 1.14it/s, est. speed input: 317.68 toks/s, output: 96.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:37<13:22, 1.56it/s, est. speed input: 375.57 toks/s, output: 117.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:37<09:17, 2.23it/s, est. speed input: 439.81 toks/s, output: 132.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:37<04:59, 4.12it/s, est. speed input: 560.83 toks/s, output: 173.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:38<04:40, 4.39it/s, est. speed input: 604.20 toks/s, output: 197.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:39<03:49, 5.35it/s, est. speed input: 660.77 toks/s, output: 222.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:39<01:51, 10.82it/s, est. speed input: 835.10 toks/s, output: 284.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:39<01:39, 12.09it/s, est. speed input: 886.99 toks/s, output: 306.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:40<01:49, 10.93it/s, est. speed input: 974.60 toks/s, output: 341.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:40<01:40, 11.90it/s, est. speed input: 1025.29 toks/s, output: 357.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:41<01:42, 11.53it/s, est. speed input: 1068.49 toks/s, output: 377.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:41<01:14, 15.84it/s, est. speed input: 1170.91 toks/s, output: 419.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:42<01:44, 11.17it/s, est. speed input: 1196.30 toks/s, output: 436.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:42<01:09, 16.76it/s, est. speed input: 1296.89 toks/s, output: 481.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:42<00:54, 21.20it/s, est. speed input: 1398.16 toks/s, output: 531.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:43<00:51, 22.23it/s, est. speed input: 1448.97 toks/s, output: 553.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:43<00:33, 33.68it/s, est. speed input: 1607.60 toks/s, output: 621.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:43<00:27, 41.26it/s, est. speed input: 1712.84 toks/s, output: 670.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:43<00:36, 30.33it/s, est. speed input: 1794.47 toks/s, output: 714.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:43<00:34, 31.97it/s, est. speed input: 1844.71 toks/s, output: 736.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:44<00:35, 31.27it/s, est. speed input: 1885.71 toks/s, output: 757.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:44<00:44, 24.80it/s, est. speed input: 1922.34 toks/s, output: 774.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:44<00:39, 27.78it/s, est. speed input: 1969.39 toks/s, output: 794.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:44<00:35, 30.72it/s, est. speed input: 2018.08 toks/s, output: 821.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:45<00:39, 27.17it/s, est. speed input: 2118.36 toks/s, output: 859.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:45<00:23, 45.13it/s, est. speed input: 2362.35 toks/s, output: 989.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:45<00:19, 52.13it/s, est. speed input: 2465.55 toks/s, output: 1037.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:45<00:16, 62.36it/s, est. speed input: 2606.86 toks/s, output: 1114.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:45<00:17, 58.38it/s, est. speed input: 2692.12 toks/s, output: 1157.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:46<00:18, 55.74it/s, est. speed input: 2781.39 toks/s, output: 1197.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:46<00:14, 66.23it/s, est. speed input: 2919.70 toks/s, output: 1282.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:46<00:18, 53.05it/s, est. speed input: 3000.40 toks/s, output: 1337.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:46<00:18, 52.57it/s, est. speed input: 3084.45 toks/s, output: 1375.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:46<00:14, 64.37it/s, est. speed input: 3222.99 toks/s, output: 1446.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:47<00:14, 66.83it/s, est. speed input: 3309.34 toks/s, output: 1502.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:47<00:11, 78.47it/s, est. speed input: 3450.59 toks/s, output: 1586.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:47<00:13, 66.71it/s, est. speed input: 3533.10 toks/s, output: 1619.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:47<00:15, 59.95it/s, est. speed input: 3610.12 toks/s, output: 1658.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:47<00:16, 56.04it/s, est. speed input: 3691.50 toks/s, output: 1698.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:47<00:15, 57.22it/s, est. speed input: 3771.02 toks/s, output: 1747.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:48<00:15, 55.39it/s, est. speed input: 3894.54 toks/s, output: 1822.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:48<00:17, 50.84it/s, est. speed input: 3970.09 toks/s, output: 1866.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:48<00:10, 80.37it/s, est. speed input: 4199.05 toks/s, output: 1984.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:48<00:10, 76.22it/s, est. speed input: 4279.26 toks/s, output: 2041.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:49<00:15, 52.39it/s, est. speed input: 4338.87 toks/s, output: 2082.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:49<00:16, 48.95it/s, est. speed input: 4409.87 toks/s, output: 2133.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:49<00:13, 60.91it/s, est. speed input: 4533.20 toks/s, output: 2217.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:49<00:12, 63.87it/s, est. speed input: 4616.83 toks/s, output: 2270.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:49<00:09, 79.86it/s, est. speed input: 4745.58 toks/s, output: 2346.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:49<00:11, 66.35it/s, est. speed input: 4816.82 toks/s, output: 2402.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:50<00:10, 69.11it/s, est. speed input: 4913.57 toks/s, output: 2450.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:50<00:08, 85.67it/s, est. speed input: 5079.42 toks/s, output: 2570.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:50<00:08, 88.60it/s, est. speed input: 5196.78 toks/s, output: 2653.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:50<00:08, 81.35it/s, est. speed input: 5272.33 toks/s, output: 2704.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:50<00:09, 76.07it/s, est. speed input: 5346.33 toks/s, output: 2762.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:50<00:07, 87.27it/s, est. speed input: 5515.08 toks/s, output: 2859.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:51<00:11, 57.67it/s, est. speed input: 5559.79 toks/s, output: 2885.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:51<00:07, 84.71it/s, est. speed input: 5777.40 toks/s, output: 3031.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:51<00:07, 82.37it/s, est. speed input: 5897.36 toks/s, output: 3103.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:51<00:07, 79.29it/s, est. speed input: 5965.19 toks/s, output: 3158.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:51<00:08, 71.18it/s, est. speed input: 6029.67 toks/s, output: 3203.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:52<00:08, 70.95it/s, est. speed input: 6102.17 toks/s, output: 3254.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:52<00:06, 84.40it/s, est. speed input: 6219.63 toks/s, output: 3343.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:52<00:06, 81.79it/s, est. speed input: 6290.00 toks/s, output: 3407.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:52<00:05, 107.99it/s, est. speed input: 6492.69 toks/s, output: 3542.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:52<00:04, 107.32it/s, est. speed input: 6606.37 toks/s, output: 3643.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:52<00:04, 116.98it/s, est. speed input: 6722.87 toks/s, output: 3721.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:52<00:03, 131.39it/s, est. speed input: 6881.88 toks/s, output: 3821.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:52<00:03, 124.41it/s, est. speed input: 6995.10 toks/s, output: 3908.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:53<00:04, 94.17it/s, est. speed input: 7095.29 toks/s, output: 3983.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:53<00:04, 91.37it/s, est. speed input: 7203.28 toks/s, output: 4081.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:53<00:05, 82.18it/s, est. speed input: 7308.57 toks/s, output: 4163.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:53<00:05, 78.68it/s, est. speed input: 7406.86 toks/s, output: 4261.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:53<00:03, 104.12it/s, est. speed input: 7608.61 toks/s, output: 4438.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:54<00:03, 112.17it/s, est. speed input: 7755.75 toks/s, output: 4559.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:54<00:03, 104.96it/s, est. speed input: 7864.36 toks/s, output: 4657.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:54<00:03, 98.26it/s, est. speed input: 7963.64 toks/s, output: 4755.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:54<00:02, 126.12it/s, est. speed input: 8171.33 toks/s, output: 4917.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:54<00:02, 128.90it/s, est. speed input: 8283.14 toks/s, output: 5023.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:54<00:02, 131.81it/s, est. speed input: 8398.93 toks/s, output: 5127.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:54<00:02, 135.07it/s, est. speed input: 8504.61 toks/s, output: 5218.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:55<00:01, 138.91it/s, est. speed input: 8620.63 toks/s, output: 5314.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:55<00:01, 127.14it/s, est. speed input: 8761.37 toks/s, output: 5443.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:55<00:01, 151.08it/s, est. speed input: 8959.22 toks/s, output: 5637.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:55<00:01, 119.69it/s, est. speed input: 9076.71 toks/s, output: 5747.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:55<00:01, 114.63it/s, est. speed input: 9177.74 toks/s, output: 5842.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:55<00:01, 118.36it/s, est. speed input: 9312.29 toks/s, output: 5992.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:56<00:01, 109.73it/s, est. speed input: 9410.10 toks/s, output: 6106.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:56<00:01, 114.37it/s, est. speed input: 9548.33 toks/s, output: 6262.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:56<00:00, 112.86it/s, est. speed input: 9649.84 toks/s, output: 6359.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:56<00:00, 130.72it/s, est. speed input: 9805.71 toks/s, output: 6507.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:56<00:00, 113.14it/s, est. speed input: 9895.99 toks/s, output: 6591.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:56<00:00, 89.40it/s, est. speed input: 9970.66 toks/s, output: 6679.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:57<00:00, 104.57it/s, est. speed input: 10116.56 toks/s, output: 6831.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:57<00:00, 72.90it/s, est. speed input: 10182.33 toks/s, output: 6936.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:57<00:00, 44.77it/s, est. speed input: 10161.69 toks/s, output: 6966.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:59<00:00, 20.81it/s, est. speed input: 10008.26 toks/s, output: 6904.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:12<00:00, 17.56it/s, est. speed input: 8176.65 toks/s, output: 5696.00 toks/s]
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.039
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 38.851
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 70.736
[36m(Runner pid=3309020)[0m mfu_actor: 0.128
[36m(Runner pid=3309020)[0m throughput: 1203.673
[36m(Runner pid=3309020)[0m time_per_step: 876.067
[36m(Runner pid=3309020)[0m total_num_tokens: 2108998
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 468.115
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1856.0
[36m(Runner pid=3309020)[0m mean: 355.712
[36m(Runner pid=3309020)[0m min: 8.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.172
[36m(Runner pid=3309020)[0m format: 0.325
[36m(Runner pid=3309020)[0m overall: 0.297
[36m(Runner pid=3309020)[0m tag_reward: 0.565
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.123228235692123e-05
[36m(Runner pid=3309020)[0m gen: 0.127
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.043
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.27
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.171
[36m(Runner pid=3309020)[0m gen: 115.413
[36m(Runner pid=3309020)[0m old: 90.975
[36m(Runner pid=3309020)[0m ref: 91.352
[36m(Runner pid=3309020)[0m reward: 7.037
[36m(Runner pid=3309020)[0m step: 876.067
[36m(Runner pid=3309020)[0m update_actor: 570.474
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 4; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:15:11 [executor_base.py:219] It took 0.339483 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:16:45 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:15:11 [executor_base.py:219] It took 0.340233 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:16:45 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:16:45 [executor_base.py:208] It took 0.327693 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:16:55 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:16:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:16:56 [executor_base.py:208] It took 0.326273 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5530420541763306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.6482361555099487, 'actor/pg_clipfrac': 0.0009551098337396979, 'actor/ppo_kl': 0.000965118408203125}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.12389370054006577, 'actor/pg_clipfrac': 0.000613496929872781, 'actor/ppo_kl': -0.0010183814447373152}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5579662919044495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002878549857996404}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.24839183688163757, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025712576461955905}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.11558978259563446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3908542990684509, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3990631103515625, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.10241230577230453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5390103459358215, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3254619240760803, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.22109635174274445, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018324685515835881}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.4192133843898773, 'actor/pg_clipfrac': 0.0005592841189354658, 'actor/ppo_kl': -0.0006345443543978035}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.6371535062789917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009666048572398722}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.47355663776397705, 'actor/pg_clipfrac': 0.0013704887824133039, 'actor/ppo_kl': -0.0010478750336915255}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.17147429287433624, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.4455115497112274, 'actor/pg_clipfrac': 0.0016163793625310063, 'actor/ppo_kl': -0.0009108823142014444}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.35663437843322754, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008978904224932194}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4406507611274719, 'actor/pg_clipfrac': 0.003021148033440113, 'actor/ppo_kl': -0.0015230592107400298}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5387069582939148, 'actor/pg_clipfrac': 0.0011507479939609766, 'actor/ppo_kl': 0.0006313796038739383}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.09111780673265457, 'actor/pg_clipfrac': 0.000830564764328301, 'actor/ppo_kl': -0.000376731448341161}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.33642521500587463, 'actor/pg_clipfrac': 0.001523229293525219, 'actor/ppo_kl': 0.0008003266993910074}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.23299787938594818, 'actor/pg_clipfrac': 0.00256739417091012, 'actor/ppo_kl': 3.644529351731762e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.09820349514484406, 'actor/pg_clipfrac': 0.004032257944345474, 'actor/ppo_kl': 0.0007227851310744882}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.023719768971204758, 'actor/pg_clipfrac': 0.002074688905850053, 'actor/ppo_kl': 0.0027411794289946556}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.26715344190597534, 'actor/pg_clipfrac': 0.0016051364364102483, 'actor/ppo_kl': -1.4618924069509376e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5193787217140198, 'actor/pg_clipfrac': 0.004347825888544321, 'actor/ppo_kl': 3.453959607213619e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.9520270824432373, 'actor/pg_clipfrac': 0.0009259259095415473, 'actor/ppo_kl': -0.0004601796390488744}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.20249122381210327, 'actor/pg_clipfrac': 0.0013568521244451404, 'actor/ppo_kl': -0.0015970589593052864}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2559998333454132, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008238509180955589}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.32622984051704407, 'actor/pg_clipfrac': 0.001355932210572064, 'actor/ppo_kl': 0.00022209555027075112}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08337999880313873, 'actor/pg_clipfrac': 0.0019083969527855515, 'actor/ppo_kl': 0.00025995998294092715}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.6617274880409241, 'actor/pg_clipfrac': 0.003846153849735856, 'actor/ppo_kl': 0.0019472885178402066}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.8173913359642029, 'actor/pg_clipfrac': 0.003692762227728963, 'actor/ppo_kl': 0.0020271888934075832}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.8039135336875916, 'actor/pg_clipfrac': 0.001832620589993894, 'actor/ppo_kl': -0.0016209585592150688}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.010424716398119926, 'actor/pg_clipfrac': 0.001226993859745562, 'actor/ppo_kl': -0.0014419209910556674}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0966983214020729, 'actor/pg_clipfrac': 0.0027881041169166565, 'actor/ppo_kl': 0.0010592307662591338}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6478144526481628, 'actor/pg_clipfrac': 0.002347417874261737, 'actor/ppo_kl': -0.0008083800203166902}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3549032211303711, 'actor/pg_clipfrac': 0.0011074197245761752, 'actor/ppo_kl': 0.0027159841265529394}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.06815548986196518, 'actor/pg_clipfrac': 0.002964426763355732, 'actor/ppo_kl': 0.0012680814834311604}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.039841100573539734, 'actor/pg_clipfrac': 0.0030156816355884075, 'actor/ppo_kl': 0.0018361704424023628}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.12424754351377487, 'actor/pg_clipfrac': 0.00784753356128931, 'actor/ppo_kl': -0.0022885019425302744}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.11522965133190155, 'actor/pg_clipfrac': 0.003607937367632985, 'actor/ppo_kl': 0.0017361586214974523}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.32279297709465027, 'actor/pg_clipfrac': 0.003863489953801036, 'actor/ppo_kl': 0.00012790043547283858}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1137523353099823, 'actor/pg_clipfrac': 0.003267973894253373, 'actor/ppo_kl': 0.003996478393673897}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.026372287422418594, 'actor/pg_clipfrac': 0.0019047618843615055, 'actor/ppo_kl': -0.0010580626549199224}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3944624960422516, 'actor/pg_clipfrac': 0.003994673956185579, 'actor/ppo_kl': 0.001793190953321755}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.30910369753837585, 'actor/pg_clipfrac': 0.0020040080416947603, 'actor/ppo_kl': 0.0014268509112298489}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.39264941215515137, 'actor/pg_clipfrac': 0.005219206679612398, 'actor/ppo_kl': -0.00032673997338861227}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12581032514572144, 'actor/pg_clipfrac': 0.0012738853693008423, 'actor/ppo_kl': -0.0008036157814785838}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.09014463424682617, 'actor/pg_clipfrac': 0.004282655194401741, 'actor/ppo_kl': 0.001053481362760067}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4820752441883087, 'actor/pg_clipfrac': 0.0028901733458042145, 'actor/ppo_kl': -0.0004544579715002328}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4113541543483734, 'actor/pg_clipfrac': 0.0024375380016863346, 'actor/ppo_kl': 2.8428908990463242e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5334363579750061, 'actor/pg_clipfrac': 0.0030581040773540735, 'actor/ppo_kl': 0.0002448907180223614}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1505710929632187, 'actor/pg_clipfrac': 0.0011422045063227415, 'actor/ppo_kl': 0.0007609264575876296}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.005097901914268732, 'actor/pg_clipfrac': 0.005563282407820225, 'actor/ppo_kl': -0.002430901164188981}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.6007726192474365, 'actor/pg_clipfrac': 0.0022753127850592136, 'actor/ppo_kl': -0.0018843921134248376}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.10707800090312958, 'actor/pg_clipfrac': 0.005212858319282532, 'actor/ppo_kl': 0.002325639594346285}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.41873061656951904, 'actor/pg_clipfrac': 0.005625000223517418, 'actor/ppo_kl': -0.00023211658117361367}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.15133821964263916, 'actor/pg_clipfrac': 0.0006734006456099451, 'actor/ppo_kl': 0.0015873224474489689}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.481330931186676, 'actor/pg_clipfrac': 0.0057708160020411015, 'actor/ppo_kl': 0.0037878891453146935}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.4472503066062927, 'actor/pg_clipfrac': 0.00285918521694839, 'actor/ppo_kl': -0.0008113280055113137}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2435605823993683, 'actor/pg_clipfrac': 0.0038784744683653116, 'actor/ppo_kl': 0.0011227132054045796}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5270168781280518, 'actor/pg_clipfrac': 0.002465078141540289, 'actor/ppo_kl': -0.0005512755014933646}
[36m(Runner pid=3309020)[0m Step 4
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.431
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:24<1:42:10, 4.81s/it, est. speed input: 94.00 toks/s, output: 20.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<44:50, 2.12s/it, est. speed input: 183.61 toks/s, output: 39.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<24:59, 1.19s/it, est. speed input: 267.43 toks/s, output: 62.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<19:57, 1.05it/s, est. speed input: 314.38 toks/s, output: 79.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<13:37, 1.54it/s, est. speed input: 389.89 toks/s, output: 101.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<09:32, 2.18it/s, est. speed input: 459.60 toks/s, output: 118.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<07:13, 2.87it/s, est. speed input: 525.06 toks/s, output: 137.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:30, 3.18it/s, est. speed input: 578.51 toks/s, output: 150.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<05:43, 3.59it/s, est. speed input: 633.76 toks/s, output: 165.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<06:36, 3.10it/s, est. speed input: 662.52 toks/s, output: 177.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<05:29, 3.72it/s, est. speed input: 711.54 toks/s, output: 192.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<04:33, 4.46it/s, est. speed input: 763.53 toks/s, output: 210.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<03:26, 5.88it/s, est. speed input: 824.29 toks/s, output: 230.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:40, 7.56it/s, est. speed input: 879.84 toks/s, output: 251.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:13, 9.05it/s, est. speed input: 935.73 toks/s, output: 268.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:40, 11.93it/s, est. speed input: 1047.16 toks/s, output: 311.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:45, 11.26it/s, est. speed input: 1096.85 toks/s, output: 332.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:38<01:25, 13.82it/s, est. speed input: 1203.34 toks/s, output: 379.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<01:04, 18.01it/s, est. speed input: 1316.06 toks/s, output: 421.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:38<01:04, 18.14it/s, est. speed input: 1369.39 toks/s, output: 439.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:39<01:39, 11.69it/s, est. speed input: 1396.27 toks/s, output: 456.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<01:21, 14.22it/s, est. speed input: 1451.68 toks/s, output: 477.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:39<01:06, 17.20it/s, est. speed input: 1504.08 toks/s, output: 505.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<00:35, 31.71it/s, est. speed input: 1671.84 toks/s, output: 584.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:40<00:34, 32.56it/s, est. speed input: 1776.77 toks/s, output: 626.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:40<00:29, 37.97it/s, est. speed input: 1880.66 toks/s, output: 665.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:40<00:37, 29.30it/s, est. speed input: 1967.16 toks/s, output: 706.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:41<00:35, 31.20it/s, est. speed input: 2069.94 toks/s, output: 758.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:41<00:31, 34.86it/s, est. speed input: 2173.83 toks/s, output: 817.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:31, 34.35it/s, est. speed input: 2217.65 toks/s, output: 838.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:41<00:36, 29.68it/s, est. speed input: 2261.14 toks/s, output: 861.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:41<00:32, 32.50it/s, est. speed input: 2311.22 toks/s, output: 882.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:42<00:17, 61.26it/s, est. speed input: 2532.84 toks/s, output: 983.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:42<00:19, 52.83it/s, est. speed input: 2627.58 toks/s, output: 1026.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:25, 40.01it/s, est. speed input: 2710.37 toks/s, output: 1066.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:42<00:19, 52.17it/s, est. speed input: 2861.30 toks/s, output: 1139.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:17, 55.88it/s, est. speed input: 2959.31 toks/s, output: 1190.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:19, 51.70it/s, est. speed input: 3054.20 toks/s, output: 1233.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:43<00:17, 56.64it/s, est. speed input: 3143.38 toks/s, output: 1284.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:43<00:13, 72.85it/s, est. speed input: 3337.00 toks/s, output: 1400.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:43<00:14, 68.21it/s, est. speed input: 3427.34 toks/s, output: 1451.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:44<00:20, 45.01it/s, est. speed input: 3537.72 toks/s, output: 1495.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:44<00:14, 62.77it/s, est. speed input: 3738.40 toks/s, output: 1607.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:44<00:11, 76.07it/s, est. speed input: 3927.81 toks/s, output: 1719.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:44<00:11, 77.40it/s, est. speed input: 4022.79 toks/s, output: 1774.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:44<00:11, 78.61it/s, est. speed input: 4112.46 toks/s, output: 1822.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:45<00:09, 85.93it/s, est. speed input: 4385.35 toks/s, output: 1989.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:45<00:08, 102.64it/s, est. speed input: 4586.95 toks/s, output: 2084.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:07, 103.21it/s, est. speed input: 4732.23 toks/s, output: 2170.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:45<00:09, 87.70it/s, est. speed input: 4865.60 toks/s, output: 2244.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:07, 98.61it/s, est. speed input: 5007.53 toks/s, output: 2322.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:46<00:10, 71.02it/s, est. speed input: 5113.52 toks/s, output: 2396.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:46<00:10, 69.11it/s, est. speed input: 5198.36 toks/s, output: 2454.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:46<00:11, 64.66it/s, est. speed input: 5281.75 toks/s, output: 2500.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:46<00:09, 76.40it/s, est. speed input: 5410.39 toks/s, output: 2576.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:09, 73.40it/s, est. speed input: 5488.50 toks/s, output: 2631.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:46<00:09, 75.86it/s, est. speed input: 5564.49 toks/s, output: 2688.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:06, 109.25it/s, est. speed input: 5796.26 toks/s, output: 2804.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:47<00:06, 101.53it/s, est. speed input: 5919.32 toks/s, output: 2891.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:47<00:06, 100.93it/s, est. speed input: 6089.83 toks/s, output: 2988.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:47<00:07, 89.17it/s, est. speed input: 6212.48 toks/s, output: 3069.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:05, 105.41it/s, est. speed input: 6398.75 toks/s, output: 3184.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:47<00:05, 109.70it/s, est. speed input: 6534.77 toks/s, output: 3257.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:05, 108.28it/s, est. speed input: 6662.41 toks/s, output: 3355.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:48<00:05, 102.70it/s, est. speed input: 6784.12 toks/s, output: 3447.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:48<00:05, 96.59it/s, est. speed input: 6902.92 toks/s, output: 3517.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:48<00:06, 89.88it/s, est. speed input: 7021.39 toks/s, output: 3601.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:48<00:04, 111.29it/s, est. speed input: 7200.88 toks/s, output: 3726.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:48<00:06, 83.20it/s, est. speed input: 7298.76 toks/s, output: 3795.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:04, 100.73it/s, est. speed input: 7475.92 toks/s, output: 3924.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:49<00:04, 102.58it/s, est. speed input: 7598.13 toks/s, output: 4017.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:03, 119.14it/s, est. speed input: 7812.22 toks/s, output: 4154.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:49<00:04, 93.14it/s, est. speed input: 7910.01 toks/s, output: 4227.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:49<00:04, 84.23it/s, est. speed input: 8012.74 toks/s, output: 4303.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:49<00:04, 88.90it/s, est. speed input: 8126.41 toks/s, output: 4394.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:50<00:04, 84.70it/s, est. speed input: 8193.40 toks/s, output: 4474.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:50<00:04, 86.66it/s, est. speed input: 8268.06 toks/s, output: 4533.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:50<00:04, 89.31it/s, est. speed input: 8386.82 toks/s, output: 4616.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:50<00:03, 91.96it/s, est. speed input: 8491.74 toks/s, output: 4726.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:50<00:02, 123.69it/s, est. speed input: 8705.39 toks/s, output: 4895.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:50<00:02, 109.75it/s, est. speed input: 8808.81 toks/s, output: 4987.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:50<00:03, 88.69it/s, est. speed input: 8907.74 toks/s, output: 5080.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:51<00:02, 94.99it/s, est. speed input: 9060.36 toks/s, output: 5169.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:51<00:03, 77.28it/s, est. speed input: 9141.64 toks/s, output: 5242.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:51<00:03, 75.15it/s, est. speed input: 9206.14 toks/s, output: 5315.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:51<00:02, 87.22it/s, est. speed input: 9317.40 toks/s, output: 5413.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:51<00:01, 121.12it/s, est. speed input: 9564.71 toks/s, output: 5638.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:52<00:01, 121.52it/s, est. speed input: 9763.29 toks/s, output: 5804.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:52<00:00, 156.98it/s, est. speed input: 10134.72 toks/s, output: 6096.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:52<00:00, 123.35it/s, est. speed input: 10263.54 toks/s, output: 6228.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:52<00:00, 107.62it/s, est. speed input: 10358.20 toks/s, output: 6335.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:53<00:01, 79.60it/s, est. speed input: 10427.25 toks/s, output: 6434.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:01, 69.79it/s, est. speed input: 10465.60 toks/s, output: 6467.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 64.91it/s, est. speed input: 10521.83 toks/s, output: 6539.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:00, 66.30it/s, est. speed input: 10578.51 toks/s, output: 6621.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:54<00:01, 39.54it/s, est. speed input: 10569.07 toks/s, output: 6631.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:54<00:01, 35.30it/s, est. speed input: 10582.57 toks/s, output: 6687.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:54<00:00, 35.26it/s, est. speed input: 10598.29 toks/s, output: 6721.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:55<00:01, 28.57it/s, est. speed input: 10580.44 toks/s, output: 6732.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 20.71it/s, est. speed input: 10529.95 toks/s, output: 6750.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:01, 14.68it/s, est. speed input: 10428.52 toks/s, output: 6697.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:56<00:00, 14.71it/s, est. speed input: 10407.55 toks/s, output: 6699.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 20.02it/s, est. speed input: 10453.27 toks/s, output: 6749.88 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.41it/s, est. speed input: 10453.27 toks/s, output: 6749.88 toks/s]
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.009
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.002
[36m(Runner pid=3309020)[0m pg_loss: 0.048
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.074
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.074
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.45
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.45
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1014131
[36m(Runner pid=3309020)[0m balanced_min: 1013791
[36m(Runner pid=3309020)[0m max: 1017137
[36m(Runner pid=3309020)[0m mean: 1013961.0
[36m(Runner pid=3309020)[0m min: 1010785
[36m(Runner pid=3309020)[0m minmax_diff: 6352
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.034
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 70.736
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 1139.244
[36m(Runner pid=3309020)[0m time_per_step: 890.03
[36m(Runner pid=3309020)[0m total_num_tokens: 2027922
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 467.033
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2985.0
[36m(Runner pid=3309020)[0m mean: 325.124
[36m(Runner pid=3309020)[0m min: 10.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.2
[36m(Runner pid=3309020)[0m format: 0.611
[36m(Runner pid=3309020)[0m overall: 0.45
[36m(Runner pid=3309020)[0m tag_reward: 0.837
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.147
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.282
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.334
[36m(Runner pid=3309020)[0m gen: 121.988
[36m(Runner pid=3309020)[0m old: 93.239
[36m(Runner pid=3309020)[0m ref: 95.699
[36m(Runner pid=3309020)[0m reward: 6.891
[36m(Runner pid=3309020)[0m step: 890.03
[36m(Runner pid=3309020)[0m update_actor: 571.199
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Step 5; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.70 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:30:05 [executor_base.py:219] It took 0.341645 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:31:36 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:30:05 [executor_base.py:219] It took 0.343435 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:31:37 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:31:37 [executor_base.py:208] It took 0.327231 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:31:37 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:31:37 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:31:37 [executor_base.py:208] It took 0.326519 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1066211462020874, 'actor/pg_clipfrac': 0.0027173913549631834, 'actor/ppo_kl': -0.0006287443684414029}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.37165674567222595, 'actor/pg_clipfrac': 0.0005934718064963818, 'actor/ppo_kl': -4.2536703404039145e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6126663684844971, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.21884873509407043, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.44715678691864014, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.13314078748226166, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007307344931177795}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.11395271867513657, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016932896105572581}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.025064587593078613, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2145693004131317, 'actor/pg_clipfrac': 0.0006756756920367479, 'actor/ppo_kl': -0.0013335472904145718}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.16711710393428802, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.48697394132614136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.39656558632850647, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5393832325935364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.050189074128866196, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.12202304601669312, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2919694185256958, 'actor/pg_clipfrac': 0.0035906643606722355, 'actor/ppo_kl': 0.0007081023068167269}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5603504776954651, 'actor/pg_clipfrac': 0.001578531926497817, 'actor/ppo_kl': 0.0013375685084611177}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.19597724080085754, 'actor/pg_clipfrac': 0.0012594457948580384, 'actor/ppo_kl': -0.0017163519514724612}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.005797103047370911, 'actor/pg_clipfrac': 0.002074688905850053, 'actor/ppo_kl': -0.0001717860286589712}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.16950762271881104, 'actor/pg_clipfrac': 0.0020463846158236265, 'actor/ppo_kl': -0.0009159644832834601}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.06501208990812302, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006102181505411863}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.36485010385513306, 'actor/pg_clipfrac': 0.0024922117590904236, 'actor/ppo_kl': 0.0003198855265509337}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 1.1285438537597656, 'actor/pg_clipfrac': 0.0044843051582574844, 'actor/ppo_kl': 0.0030855427030473948}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.03285476192831993, 'actor/pg_clipfrac': 0.00231660227291286, 'actor/ppo_kl': -1.636195884202607e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.41884759068489075, 'actor/pg_clipfrac': 0.0006925207562744617, 'actor/ppo_kl': -0.00019128236453980207}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.045438170433044434, 'actor/pg_clipfrac': 0.001485884073190391, 'actor/ppo_kl': 0.0006271507008932531}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3052199184894562, 'actor/pg_clipfrac': 0.0017391304718330503, 'actor/ppo_kl': 0.0015151911647990346}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.18925409018993378, 'actor/pg_clipfrac': 0.0010362694738432765, 'actor/ppo_kl': -0.0012676852056756616}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.28258490562438965, 'actor/pg_clipfrac': 0.003389830468222499, 'actor/ppo_kl': 0.0014682019827887416}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.21086876094341278, 'actor/pg_clipfrac': 0.0030604437924921513, 'actor/ppo_kl': 0.0004948832793161273}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.5973389744758606, 'actor/pg_clipfrac': 0.0015748031437397003, 'actor/ppo_kl': 0.0011277085868641734}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5570568442344666, 'actor/pg_clipfrac': 0.0030840400140732527, 'actor/ppo_kl': 0.00030968166538514197}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.38126084208488464, 'actor/pg_clipfrac': 0.0032025619875639677, 'actor/ppo_kl': 0.0021157809533178806}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.20382563769817352, 'actor/pg_clipfrac': 0.0023942538537085056, 'actor/ppo_kl': -0.00028158281929790974}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.18038396537303925, 'actor/pg_clipfrac': 0.003105590119957924, 'actor/ppo_kl': -0.00035391683923080564}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.6801029443740845, 'actor/pg_clipfrac': 0.001036806614138186, 'actor/ppo_kl': 0.0003504513297230005}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.174666166305542, 'actor/pg_clipfrac': 0.002955665113404393, 'actor/ppo_kl': 0.0007349775405600667}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.06570395827293396, 'actor/pg_clipfrac': 0.0060901339165866375, 'actor/ppo_kl': -0.000980755896307528}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 8.63383975229226e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00037230888847261667}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.08586447685956955, 'actor/pg_clipfrac': 0.003496503457427025, 'actor/ppo_kl': -0.0006480228039436042}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.44700324535369873, 'actor/pg_clipfrac': 0.003086419776082039, 'actor/ppo_kl': 0.0007404795615002513}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.9556289315223694, 'actor/pg_clipfrac': 0.006802720949053764, 'actor/ppo_kl': 0.0007723779417574406}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.28624603152275085, 'actor/pg_clipfrac': 0.0012634238228201866, 'actor/ppo_kl': 0.0009309114539064467}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.15010756254196167, 'actor/pg_clipfrac': 0.0016977929044514894, 'actor/ppo_kl': -0.0016413560369983315}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.016544025391340256, 'actor/pg_clipfrac': 0.003197442041710019, 'actor/ppo_kl': 1.741313280945178e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.5460610389709473, 'actor/pg_clipfrac': 0.0055741360411047935, 'actor/ppo_kl': -0.00014505205035675317}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.22125811874866486, 'actor/pg_clipfrac': 0.0031225604470819235, 'actor/ppo_kl': -0.0017806841060519218}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.9308961033821106, 'actor/pg_clipfrac': 0.0024509804788976908, 'actor/ppo_kl': -0.00028403912438079715}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.06554176658391953, 'actor/pg_clipfrac': 0.0023547881282866, 'actor/ppo_kl': 0.0005389562575146556}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5534708499908447, 'actor/pg_clipfrac': 0.004640371073037386, 'actor/ppo_kl': 4.9980499170487747e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2018233686685562, 'actor/pg_clipfrac': 0.0009433962404727936, 'actor/ppo_kl': -0.0009661836666055024}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0614144504070282, 'actor/pg_clipfrac': 0.00663129985332489, 'actor/ppo_kl': -0.0008334405138157308}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.580050528049469, 'actor/pg_clipfrac': 0.0035211266949772835, 'actor/ppo_kl': -0.0010509100975468755}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.05510333552956581, 'actor/pg_clipfrac': 0.0007320644217543304, 'actor/ppo_kl': 0.0005906664882786572}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.305435448884964, 'actor/pg_clipfrac': 0.00651890505105257, 'actor/ppo_kl': 0.0028929305262863636}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.8978687524795532, 'actor/pg_clipfrac': 0.0038255546241998672, 'actor/ppo_kl': -0.00027314634644426405}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.8387656807899475, 'actor/pg_clipfrac': 0.008130080997943878, 'actor/ppo_kl': -0.00017070614558178931}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:35:00, 15.16s/it, est. speed input: 30.93 toks/s, output: 4.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<39:52, 6.38s/it, est. speed input: 60.74 toks/s, output: 9.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<22:04, 3.54s/it, est. speed input: 90.37 toks/s, output: 15.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 4/377 [00:15<14:07, 2.27s/it, est. speed input: 117.52 toks/s, output: 21.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 8/377 [00:16<04:28, 1.38it/s, est. speed input: 232.32 toks/s, output: 45.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:16<02:39, 2.30it/s, est. speed input: 317.39 toks/s, output: 64.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 15/377 [00:16<01:33, 3.87it/s, est. speed input: 427.48 toks/s, output: 88.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 19/377 [00:16<01:02, 5.76it/s, est. speed input: 534.03 toks/s, output: 114.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 23/377 [00:16<00:43, 8.13it/s, est. speed input: 640.48 toks/s, output: 141.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:16<00:36, 9.67it/s, est. speed input: 717.37 toks/s, output: 161.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 30/377 [00:16<00:28, 12.31it/s, est. speed input: 823.47 toks/s, output: 188.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 35/377 [00:17<00:21, 16.09it/s, est. speed input: 949.53 toks/s, output: 225.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 39/377 [00:17<00:18, 18.15it/s, est. speed input: 1048.20 toks/s, output: 254.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 43/377 [00:17<00:15, 20.92it/s, est. speed input: 1147.60 toks/s, output: 283.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 47/377 [00:17<00:14, 23.39it/s, est. speed input: 1244.19 toks/s, output: 313.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 50/377 [00:17<00:15, 21.11it/s, est. speed input: 1310.02 toks/s, output: 335.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 58/377 [00:17<00:10, 31.27it/s, est. speed input: 1508.53 toks/s, output: 400.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 66/377 [00:17<00:07, 39.80it/s, est. speed input: 1705.79 toks/s, output: 467.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 73/377 [00:18<00:06, 44.50it/s, est. speed input: 1874.93 toks/s, output: 525.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 84/377 [00:18<00:05, 57.22it/s, est. speed input: 2141.01 toks/s, output: 622.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 91/377 [00:18<00:05, 54.38it/s, est. speed input: 2299.83 toks/s, output: 682.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 97/377 [00:18<00:05, 53.91it/s, est. speed input: 2437.77 toks/s, output: 735.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 103/377 [00:18<00:05, 47.23it/s, est. speed input: 2565.53 toks/s, output: 787.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 109/377 [00:18<00:06, 43.37it/s, est. speed input: 2702.31 toks/s, output: 838.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 114/377 [00:18<00:06, 41.29it/s, est. speed input: 2805.88 toks/s, output: 881.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 124/377 [00:19<00:05, 48.84it/s, est. speed input: 3025.55 toks/s, output: 974.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 129/377 [00:19<00:06, 40.64it/s, est. speed input: 3118.83 toks/s, output: 1016.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 135/377 [00:19<00:05, 42.91it/s, est. speed input: 3245.66 toks/s, output: 1074.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 149/377 [00:19<00:03, 62.27it/s, est. speed input: 3561.10 toks/s, output: 1218.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 156/377 [00:19<00:03, 59.00it/s, est. speed input: 3704.44 toks/s, output: 1287.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:19<00:04, 51.94it/s, est. speed input: 3836.41 toks/s, output: 1356.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 169/377 [00:19<00:03, 52.31it/s, est. speed input: 3955.34 toks/s, output: 1418.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 177/377 [00:20<00:03, 57.94it/s, est. speed input: 4122.76 toks/s, output: 1503.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 185/377 [00:20<00:03, 62.54it/s, est. speed input: 4287.53 toks/s, output: 1589.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 192/377 [00:20<00:03, 60.53it/s, est. speed input: 4422.13 toks/s, output: 1665.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 199/377 [00:20<00:02, 62.74it/s, est. speed input: 4559.02 toks/s, output: 1743.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 206/377 [00:20<00:02, 61.41it/s, est. speed input: 4691.63 toks/s, output: 1820.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 213/377 [00:20<00:02, 57.86it/s, est. speed input: 4818.66 toks/s, output: 1897.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:20<00:02, 53.52it/s, est. speed input: 4968.79 toks/s, output: 1987.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 227/377 [00:20<00:03, 49.10it/s, est. speed input: 5068.49 toks/s, output: 2052.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 236/377 [00:21<00:02, 54.62it/s, est. speed input: 5237.53 toks/s, output: 2162.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:21<00:02, 63.66it/s, est. speed input: 5434.44 toks/s, output: 2289.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 253/377 [00:21<00:01, 62.86it/s, est. speed input: 5560.80 toks/s, output: 2376.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 260/377 [00:21<00:01, 62.98it/s, est. speed input: 5687.78 toks/s, output: 2465.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 267/377 [00:21<00:01, 56.94it/s, est. speed input: 5807.07 toks/s, output: 2552.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 275/377 [00:21<00:01, 59.02it/s, est. speed input: 5949.81 toks/s, output: 2656.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 282/377 [00:21<00:01, 51.94it/s, est. speed input: 6053.67 toks/s, output: 2742.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▋ | 288/377 [00:21<00:01, 52.95it/s, est. speed input: 6153.54 toks/s, output: 2825.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 295/377 [00:22<00:01, 56.56it/s, est. speed input: 6275.63 toks/s, output: 2922.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 302/377 [00:22<00:01, 57.47it/s, est. speed input: 6393.81 toks/s, output: 3022.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 308/377 [00:22<00:01, 51.36it/s, est. speed input: 6477.04 toks/s, output: 3102.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:22<00:01, 46.42it/s, est. speed input: 6558.71 toks/s, output: 3184.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 319/377 [00:22<00:01, 40.90it/s, est. speed input: 6618.94 toks/s, output: 3248.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 324/377 [00:22<00:01, 35.58it/s, est. speed input: 6667.91 toks/s, output: 3311.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 328/377 [00:23<00:01, 31.77it/s, est. speed input: 6703.07 toks/s, output: 3362.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 334/377 [00:23<00:01, 37.22it/s, est. speed input: 6798.61 toks/s, output: 3463.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 339/377 [00:23<00:01, 34.23it/s, est. speed input: 6848.54 toks/s, output: 3535.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 343/377 [00:23<00:01, 32.65it/s, est. speed input: 6890.95 toks/s, output: 3594.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 347/377 [00:23<00:01, 27.84it/s, est. speed input: 6913.51 toks/s, output: 3647.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 352/377 [00:23<00:00, 27.31it/s, est. speed input: 6960.82 toks/s, output: 3725.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 355/377 [00:24<00:00, 24.93it/s, est. speed input: 6974.07 toks/s, output: 3766.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:24<00:00, 19.18it/s, est. speed input: 6954.48 toks/s, output: 3793.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:24<00:01, 14.56it/s, est. speed input: 6912.28 toks/s, output: 3810.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 363/377 [00:24<00:01, 13.10it/s, est. speed input: 6890.33 toks/s, output: 3827.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:25<00:00, 12.78it/s, est. speed input: 6880.92 toks/s, output: 3867.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:25<00:00, 12.86it/s, est. speed input: 6877.89 toks/s, output: 3898.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:25<00:00, 8.47it/s, est. speed input: 6783.06 toks/s, output: 3880.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:41<00:00, 8.47it/s, est. speed input: 6561.34 toks/s, output: 3773.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [00:48<00:14, 2.96s/it, est. speed input: 3625.36 toks/s, output: 2152.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:49<00:10, 2.68s/it, est. speed input: 3551.39 toks/s, output: 2176.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:50<00:06, 2.33s/it, est. speed input: 3510.92 toks/s, output: 2220.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [00:55<00:05, 2.95s/it, est. speed input: 3180.80 toks/s, output: 2085.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 376/377 [01:04<00:04, 4.23s/it, est. speed input: 2763.23 toks/s, output: 1890.92 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:04<00:00, 5.85it/s, est. speed input: 2767.63 toks/s, output: 1974.44 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.21838928759098053, 'actor/pg_clipfrac': 0.005328597035259008, 'actor/ppo_kl': 0.004209088161587715}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 5.315240196068771e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012811555061489344}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.71950364112854, 'actor/pg_clipfrac': 0.006211180239915848, 'actor/ppo_kl': 0.004044070839881897}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.0476205050945282, 'actor/pg_clipfrac': 0.0036697247996926308, 'actor/ppo_kl': -0.000798531633336097}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.20016440749168396, 'actor/pg_clipfrac': 0.008939974009990692, 'actor/ppo_kl': -0.0015272673917934299}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.8543013334274292, 'actor/pg_clipfrac': 0.0006027727504260838, 'actor/ppo_kl': 0.0008463810663670301}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.208152174949646, 'actor/pg_clipfrac': 0.007252946496009827, 'actor/ppo_kl': 0.0014820098876953125}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:44:55 [executor_base.py:219] It took 0.340659 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:46:44 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:44:55 [executor_base.py:219] It took 0.340565 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:46:44 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:46:44 [executor_base.py:208] It took 0.327123 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:46:44 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:46:45 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:46:45 [executor_base.py:208] It took 0.325882 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the geometric properties and the given information. Since D is the midpoint of AB, AD = DB = 2.0. The perimeter of triangle ABC is 16.0, so AC + BC = 16.0 - AB = 16.0 - 4.0 = 12.0.\n\nThe perimeter of triangle AEC is AC + AE + EC. Since AE is a variable length, we need to minimize the sum of AC and EC. The minimum distance between a point and a line is the perpendicular distance. Therefore, the minimum perimeter of triangle AEC occurs when AE is perpendicular to line l passing through D. This makes AE the shortest possible distance from A to line l.\n\nSince AE is perpendicular to line l and D is the midpoint of AB, AE will be half the length of AB, which is 2.0. Therefore, AE = 2.0.\n\nNow, we need to find the length of EC. Since E is on line l, which is perpendicular to AB, EC will be the same as AC because E is directly above A in the perpendicular direction.\n\nSo, AC + AE + EC = AC + AE + AC = 2 * AC + AE = 2 * 6.0 + 2.0 = 12.0 + 2.0 = 14.0.\n\nTherefore, the minimum perimeter of triangle AEC is 14.0.\n\n\nThe minimum perimeter of triangle AEC is 14.0.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo solve this problem, let's consider the properties of the rotation and the given angles.\n\n1. Since triangle ABC is rotated clockwise around point A by 90 degrees to form triangle AB'C', the rotation preserves the lengths of the sides and the angles. Therefore, angle BAC = angle B'A'C' = 90 degrees.\n2. The rotation also preserves the angles at point A, so angle B'A'C' = 90 degrees.\n3. Since angle CC'B' = 32 degrees, and considering that angle B'C'A' = 90 degrees (from the rotation), we can find angle AC'B' by subtracting angle CC'B' from angle B'C'A'.\n4. Thus, angle AC'B' = angle B'C'A' - angle CC'B' = 90 degrees - 32 degrees = 58 degrees.\n\n\nThe size of angle AC'B' is 58 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that A, B, and D are on circle O, and C is on the diameter BE, we can use properties of circles and angles in a circle. Since A, B, and D are on circle O, angle ADC is an inscribed angle that intercepts the arc AD. The measure of an inscribed angle is half the measure of the central angle that intercepts the same arc. Thus, angle AOB, which is the central angle intercepting arc AD, is twice the measure of angle ADC, which is 54 degrees. So, angle AOB is 108 degrees.\n\nSince C is on the diameter BE, angle AEC is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Therefore, angle AEB is the sum of angle AEC and angle AEB, which is 90 degrees plus the measure of angle AOB, which is 108 degrees. Thus, angle AEB is 198 degrees.\n\nHowever, this calculation seems incorrect because angles in a circle cannot exceed 360 degrees. Let's re-evaluate the problem. Since A, B, and D are on circle O, angle ADB is also an inscribed angle that intercepts the same arc AD. Therefore, angle ADB is half the measure of angle AOB, which is 54 degrees. \n\nNow, since angle ADB is an inscribed angle in the circle, it intercepts the arc AD. This means angle AEB, which is the angle at the circumference opposite to the inscribed angle ADB, must be half of the central angle AOB. But we need to correct our approach. Since C is on the diameter BE, and BE is a straight line, angle AEB is actually the supplementary angle to angle ADB at the circumference. Therefore, angle AEB is 180 degrees - angle ADB, which is 180 degrees - 54 degrees = 126 degrees.\n\nSo, the correct measure of angle AEB is 126 degrees.\n\n\n126 degrees\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, and CD is tangent to circle O at point D, we can use the properties of tangents and circles to find angle C. Since CD is tangent to circle O, angle ODC is a right angle (90 degrees) because the radius of a circle is perpendicular to the tangent at the point of tangency.\n\nGiven angle A is 35 degrees, we can use the fact that angle AOB is twice angle A because the angle subtended by an arc at the center is twice the angle subtended by the same arc at any point on the remaining part of the circle. Therefore, angle AOB is 2 * 35 degrees = 70 degrees.\n\nSince angle ODC is 90 degrees and angle AOB is 70 degrees, angle C can be found by subtracting the sum of these angles from 180 degrees, as they form a straight line. So, angle C = 180 degrees - 90 degrees - 70 degrees = 20 degrees.\n\n\n20 degrees\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ACD is a right angle (90 degrees) because any angle subtended by a diameter in a circle is a right angle. Since EA is tangent to circle O at point A, angle EAC is 120 degrees. In triangle ACD, angle ACD is 90 degrees and angle EAC is 120 degrees. The sum of angles in a triangle is 180 degrees, so angle CAD can be found by subtracting the sum of the other two angles from 180 degrees. Therefore, angle CAD = 180 - 90 - 120 = 60 degrees. Since AD is the diameter, angle ABD is a right angle (90 degrees) and angle DBC is half of angle CAD, which is 30 degrees. Therefore, angle ABC is the sum of angle ABD and angle DBC, which is 90 + 30 = 120 degrees.\n\n\n120\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 5
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.375
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.012
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.002
[36m(Runner pid=3309020)[0m pg_loss: 0.029
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.047
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.047
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.539
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.539
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 982791
[36m(Runner pid=3309020)[0m balanced_min: 982790
[36m(Runner pid=3309020)[0m max: 982889
[36m(Runner pid=3309020)[0m mean: 982790.5
[36m(Runner pid=3309020)[0m min: 982692
[36m(Runner pid=3309020)[0m minmax_diff: 197
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 102.59
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 70.736
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(Runner pid=3309020)[0m throughput: 934.755
[36m(Runner pid=3309020)[0m time_per_step: 1051.388
[36m(Runner pid=3309020)[0m total_num_tokens: 1965581
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 666.0
[36m(Runner pid=3309020)[0m mean: 466.17
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1312.0
[36m(Runner pid=3309020)[0m mean: 301.635
[36m(Runner pid=3309020)[0m min: 49.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.211
[36m(Runner pid=3309020)[0m format: 0.82
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:47:23, 5.05s/it, est. speed input: 103.89 toks/s, output: 19.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<53:11, 2.51s/it, est. speed input: 169.82 toks/s, output: 36.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<29:31, 1.40s/it, est. speed input: 243.99 toks/s, output: 61.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<14:15, 1.47it/s, est. speed input: 386.92 toks/s, output: 92.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<10:28, 1.99it/s, est. speed input: 456.06 toks/s, output: 112.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<08:03, 2.58it/s, est. speed input: 520.05 toks/s, output: 131.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:12, 3.33it/s, est. speed input: 584.05 toks/s, output: 153.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<04:54, 4.19it/s, est. speed input: 644.61 toks/s, output: 175.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<03:42, 5.53it/s, est. speed input: 714.54 toks/s, output: 195.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:33<03:55, 5.20it/s, est. speed input: 757.73 toks/s, output: 208.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<02:55, 6.96it/s, est. speed input: 823.32 toks/s, output: 227.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:34<01:34, 12.69it/s, est. speed input: 1015.23 toks/s, output: 287.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:07, 17.80it/s, est. speed input: 1141.61 toks/s, output: 325.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:34<01:06, 17.91it/s, est. speed input: 1198.69 toks/s, output: 345.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:35<01:12, 16.31it/s, est. speed input: 1249.91 toks/s, output: 362.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:35<00:40, 28.95it/s, est. speed input: 1494.29 toks/s, output: 448.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:35<00:35, 32.90it/s, est. speed input: 1618.65 toks/s, output: 495.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:35<00:28, 39.56it/s, est. speed input: 1734.79 toks/s, output: 540.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:36<00:32, 34.99it/s, est. speed input: 1847.29 toks/s, output: 576.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:31, 35.93it/s, est. speed input: 1906.90 toks/s, output: 598.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:36<00:25, 44.17it/s, est. speed input: 2031.29 toks/s, output: 646.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:36<00:32, 34.08it/s, est. speed input: 2186.36 toks/s, output: 715.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:37<00:29, 37.27it/s, est. speed input: 2355.37 toks/s, output: 773.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:37<00:21, 49.73it/s, est. speed input: 2531.85 toks/s, output: 846.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:37<00:18, 56.06it/s, est. speed input: 2651.32 toks/s, output: 890.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:37<00:19, 53.30it/s, est. speed input: 2750.50 toks/s, output: 932.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:21, 48.33it/s, est. speed input: 2849.80 toks/s, output: 972.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:38<00:25, 40.05it/s, est. speed input: 2943.62 toks/s, output: 1010.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:38<00:18, 54.45it/s, est. speed input: 3169.53 toks/s, output: 1107.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:38<00:14, 68.48it/s, est. speed input: 3336.28 toks/s, output: 1183.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:38<00:14, 68.05it/s, est. speed input: 3444.89 toks/s, output: 1238.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:38<00:12, 76.30it/s, est. speed input: 3612.02 toks/s, output: 1305.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:39<00:18, 51.59it/s, est. speed input: 3695.42 toks/s, output: 1346.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:39<00:16, 58.33it/s, est. speed input: 3854.21 toks/s, output: 1417.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:39<00:16, 57.15it/s, est. speed input: 3957.50 toks/s, output: 1465.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:39<00:13, 69.15it/s, est. speed input: 4125.34 toks/s, output: 1536.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:40<00:15, 58.09it/s, est. speed input: 4215.63 toks/s, output: 1569.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:40<00:19, 46.40it/s, est. speed input: 4294.63 toks/s, output: 1594.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:40<00:17, 52.46it/s, est. speed input: 4395.75 toks/s, output: 1633.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:40<00:13, 66.49it/s, est. speed input: 4550.63 toks/s, output: 1697.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:40<00:12, 70.08it/s, est. speed input: 4655.37 toks/s, output: 1756.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:41<00:16, 52.78it/s, est. speed input: 4731.65 toks/s, output: 1797.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:41<00:17, 49.29it/s, est. speed input: 4817.90 toks/s, output: 1854.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:41<00:13, 60.55it/s, est. speed input: 4964.49 toks/s, output: 1929.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:41<00:14, 56.12it/s, est. speed input: 5049.21 toks/s, output: 1982.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:41<00:09, 84.17it/s, est. speed input: 5306.70 toks/s, output: 2110.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:41<00:10, 77.24it/s, est. speed input: 5393.67 toks/s, output: 2160.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:42<00:10, 76.66it/s, est. speed input: 5487.51 toks/s, output: 2213.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:42<00:07, 95.96it/s, est. speed input: 5692.93 toks/s, output: 2327.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:42<00:06, 121.94it/s, est. speed input: 5942.11 toks/s, output: 2461.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:42<00:06, 107.35it/s, est. speed input: 6083.25 toks/s, output: 2518.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:42<00:09, 77.19it/s, est. speed input: 6195.84 toks/s, output: 2581.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:42<00:08, 85.93it/s, est. speed input: 6338.11 toks/s, output: 2655.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:43<00:07, 85.51it/s, est. speed input: 6475.45 toks/s, output: 2719.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:43<00:10, 62.33it/s, est. speed input: 6536.25 toks/s, output: 2756.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:43<00:09, 71.49it/s, est. speed input: 6681.26 toks/s, output: 2825.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:43<00:07, 88.14it/s, est. speed input: 6866.13 toks/s, output: 2942.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:43<00:05, 104.97it/s, est. speed input: 7062.46 toks/s, output: 3034.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:44<00:05, 109.56it/s, est. speed input: 7200.04 toks/s, output: 3105.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:44<00:05, 103.44it/s, est. speed input: 7339.25 toks/s, output: 3197.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:44<00:05, 104.04it/s, est. speed input: 7469.81 toks/s, output: 3285.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:44<00:05, 109.87it/s, est. speed input: 7614.50 toks/s, output: 3371.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:44<00:04, 126.66it/s, est. speed input: 7803.59 toks/s, output: 3489.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:44<00:05, 94.41it/s, est. speed input: 7930.80 toks/s, output: 3559.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:45<00:05, 89.96it/s, est. speed input: 8055.88 toks/s, output: 3629.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:45<00:04, 97.84it/s, est. speed input: 8186.23 toks/s, output: 3727.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:45<00:04, 106.45it/s, est. speed input: 8368.92 toks/s, output: 3823.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:45<00:04, 111.24it/s, est. speed input: 8502.73 toks/s, output: 3916.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:45<00:04, 106.87it/s, est. speed input: 8633.15 toks/s, output: 4006.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:45<00:03, 124.08it/s, est. speed input: 8820.69 toks/s, output: 4119.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:45<00:03, 125.34it/s, est. speed input: 9000.84 toks/s, output: 4275.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:46<00:03, 96.91it/s, est. speed input: 9106.43 toks/s, output: 4343.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:46<00:03, 111.63it/s, est. speed input: 9290.68 toks/s, output: 4454.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:46<00:03, 105.50it/s, est. speed input: 9511.71 toks/s, output: 4613.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:46<00:03, 94.60it/s, est. speed input: 9613.04 toks/s, output: 4697.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:46<00:03, 100.52it/s, est. speed input: 9799.79 toks/s, output: 4802.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:46<00:02, 112.61it/s, est. speed input: 9968.80 toks/s, output: 4911.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:47<00:02, 103.95it/s, est. speed input: 10078.38 toks/s, output: 5004.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:47<00:02, 101.08it/s, est. speed input: 10189.00 toks/s, output: 5088.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:47<00:03, 72.91it/s, est. speed input: 10260.10 toks/s, output: 5167.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:47<00:03, 64.01it/s, est. speed input: 10338.99 toks/s, output: 5243.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:48<00:02, 81.08it/s, est. speed input: 10517.14 toks/s, output: 5379.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:48<00:02, 68.19it/s, est. speed input: 10590.79 toks/s, output: 5452.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:48<00:02, 66.05it/s, est. speed input: 10654.20 toks/s, output: 5509.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:48<00:02, 76.59it/s, est. speed input: 10788.19 toks/s, output: 5620.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:48<00:01, 89.87it/s, est. speed input: 10910.74 toks/s, output: 5748.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:48<00:01, 79.83it/s, est. speed input: 11001.34 toks/s, output: 5883.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:49<00:01, 74.26it/s, est. speed input: 11049.66 toks/s, output: 5952.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:49<00:01, 72.37it/s, est. speed input: 11150.89 toks/s, output: 6090.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:49<00:01, 64.93it/s, est. speed input: 11200.21 toks/s, output: 6157.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:49<00:01, 70.82it/s, est. speed input: 11305.69 toks/s, output: 6294.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:49<00:01, 72.77it/s, est. speed input: 11371.23 toks/s, output: 6365.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:50<00:01, 46.37it/s, est. speed input: 11367.33 toks/s, output: 6401.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:50<00:01, 31.88it/s, est. speed input: 11327.61 toks/s, output: 6415.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:01, 33.47it/s, est. speed input: 11350.39 toks/s, output: 6436.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:51<00:01, 32.38it/s, est. speed input: 11358.48 toks/s, output: 6465.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:51<00:01, 23.07it/s, est. speed input: 11302.31 toks/s, output: 6477.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:01, 24.69it/s, est. speed input: 11312.11 toks/s, output: 6504.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 30.02it/s, est. speed input: 11357.32 toks/s, output: 6570.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:52<00:00, 27.33it/s, est. speed input: 11348.88 toks/s, output: 6580.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:53<00:00, 12.20it/s, est. speed input: 11141.63 toks/s, output: 6493.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 9.00it/s, est. speed input: 10975.22 toks/s, output: 6410.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 2.10it/s, est. speed input: 9677.16 toks/s, output: 5688.29 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 20.57it/s, est. speed input: 9677.16 toks/s, output: 5688.29 toks/s]
[36m(Runner pid=3309020)[0m overall: 0.539
[36m(Runner pid=3309020)[0m tag_reward: 0.939
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.141
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.288
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.212
[36m(Runner pid=3309020)[0m gen: 108.834
[36m(Runner pid=3309020)[0m old: 91.174
[36m(Runner pid=3309020)[0m ref: 89.903
[36m(Runner pid=3309020)[0m reward: 6.616
[36m(Runner pid=3309020)[0m save_checkpoint: 31.704
[36m(Runner pid=3309020)[0m step: 1051.388
[36m(Runner pid=3309020)[0m update_actor: 566.204
[36m(Runner pid=3309020)[0m validation: 155.743
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.292
[36m(Runner pid=3309020)[0m format_reward: 0.963
[36m(Runner pid=3309020)[0m overall_reward: 0.632
[36m(Runner pid=3309020)[0m reward_score: 0.632
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.985
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 6; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:47:38 [executor_base.py:219] It took 0.353172 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:49:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:47:38 [executor_base.py:219] It took 0.340248 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:49:35 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 22:49:35 [executor_base.py:208] It took 0.329653 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.87 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:49:38 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:49:38 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.87 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 22:49:38 [executor_base.py:208] It took 0.327060 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.18919135630130768, 'actor/pg_clipfrac': 0.0014992504147812724, 'actor/ppo_kl': -0.0005726134986616671}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.5441110134124756, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6048597693443298, 'actor/pg_clipfrac': 0.002797202905640006, 'actor/ppo_kl': 0.0008436576463282108}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.14323122799396515, 'actor/pg_clipfrac': 0.00128369708545506, 'actor/ppo_kl': -0.0005628062062896788}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1171397939324379, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2352742999792099, 'actor/pg_clipfrac': 0.0014836795162409544, 'actor/ppo_kl': -0.0024400718975812197}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.20350241661071777, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3394056260585785, 'actor/pg_clipfrac': 0.0015748031437397003, 'actor/ppo_kl': -0.0009396800887770951}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.30406346917152405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.16788215935230255, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.12752272188663483, 'actor/pg_clipfrac': 0.0018587360391393304, 'actor/ppo_kl': 0.0022444482892751694}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00011873270705109462, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00046314438804984093}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.17093361914157867, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': -0.0004590068419929594}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2308061420917511, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.13017380237579346, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.8377732634544373, 'actor/pg_clipfrac': 0.0008771930006332695, 'actor/ppo_kl': -0.0003246763371862471}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.12482468038797379, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006455841357819736}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.29622867703437805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007027008105069399}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00010941783693851903, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018309913575649261}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 5.909151877858676e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004267763579264283}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.11889130622148514, 'actor/pg_clipfrac': 0.0009460737928748131, 'actor/ppo_kl': 0.0006099184975028038}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.1479230374097824, 'actor/pg_clipfrac': 0.004268943332135677, 'actor/ppo_kl': -0.0024057936388999224}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.11916924268007278, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.860134580056183e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.36003559827804565, 'actor/pg_clipfrac': 0.004189944360405207, 'actor/ppo_kl': -0.00301483366638422}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 9.626895189285278e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006033196696080267}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4472205340862274, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00021398525859694928}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3518683612346649, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00041021633660420775}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.29873621463775635, 'actor/pg_clipfrac': 0.0006082725012674928, 'actor/ppo_kl': 0.00018822422134689987}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5121679902076721, 'actor/pg_clipfrac': 0.0021551724057644606, 'actor/ppo_kl': -0.0005351448780857027}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 8.016990614123642e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004346407949924469}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5750166773796082, 'actor/pg_clipfrac': 0.0026281208265572786, 'actor/ppo_kl': -0.000281217391602695}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.15889675915241241, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002238202840089798}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.02436131425201893, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002165770623832941}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2202020287513733, 'actor/pg_clipfrac': 0.0027662517968565226, 'actor/ppo_kl': -0.0008733519935049117}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.25263023376464844, 'actor/pg_clipfrac': 0.0016570008592680097, 'actor/ppo_kl': 0.0002958902914542705}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5441900491714478, 'actor/pg_clipfrac': 0.000892060634214431, 'actor/ppo_kl': 0.00010504539386602119}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5216394662857056, 'actor/pg_clipfrac': 0.0019417476141825318, 'actor/ppo_kl': 0.0008135638199746609}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.03705505654215813, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000318810751195997}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.13203735649585724, 'actor/pg_clipfrac': 0.0014684287598356605, 'actor/ppo_kl': -0.0017857712227851152}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.38046613335609436, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010619827080518007}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.4478193521499634, 'actor/pg_clipfrac': 0.005263158120214939, 'actor/ppo_kl': -0.0011497095692902803}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3802042305469513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011105885496363044}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.4032150208950043, 'actor/pg_clipfrac': 0.0016528925625607371, 'actor/ppo_kl': -0.0008964680600911379}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5908042192459106, 'actor/pg_clipfrac': 0.0010899183107540011, 'actor/ppo_kl': 0.0007105138502083719}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3477995991706848, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008540069684386253}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.02703632041811943, 'actor/pg_clipfrac': 0.001623376621864736, 'actor/ppo_kl': -0.0008635118138045073}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.26995983719825745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012932991376146674}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4470061659812927, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008922365959733725}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3176509737968445, 'actor/pg_clipfrac': 0.0013386880746111274, 'actor/ppo_kl': 0.0015374378999695182}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.13379041850566864, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010799856390804052}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.39765068888664246, 'actor/pg_clipfrac': 0.0017094017239287496, 'actor/ppo_kl': -0.0003091600083280355}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.040229540318250656, 'actor/pg_clipfrac': 0.0027985074557363987, 'actor/ppo_kl': 0.0004956775810569525}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 9.274481271859258e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010164135892409831}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.8961145877838135, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003802708815783262}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1105448454618454, 'actor/pg_clipfrac': 0.0009569377871230245, 'actor/ppo_kl': 0.002281004562973976}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.07515621930360794, 'actor/pg_clipfrac': 0.0031120332423597574, 'actor/ppo_kl': 9.070392115972936e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.27655407786369324, 'actor/pg_clipfrac': 0.0013218771200627089, 'actor/ppo_kl': 0.000737668655347079}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00014527246821671724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013198499800637364}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.9850698113441467, 'actor/pg_clipfrac': 0.000928505090996623, 'actor/ppo_kl': -0.0009115116554312408}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3422924876213074, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006273284670896828}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.26036733388900757, 'actor/pg_clipfrac': 0.0010741138830780983, 'actor/ppo_kl': 0.00016064444207586348}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23224525153636932, 'actor/pg_clipfrac': 0.0009737098589539528, 'actor/ppo_kl': 0.002187303500249982}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.20667709410190582, 'actor/pg_clipfrac': 0.0017605633474886417, 'actor/ppo_kl': 0.0015302756801247597}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5510267019271851, 'actor/pg_clipfrac': 0.0015772870974615216, 'actor/ppo_kl': -0.0008325885282829404}
[36m(Runner pid=3309020)[0m Step 6
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.353
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.01
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.034
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:18<1:17:55, 3.67s/it, est. speed input: 124.90 toks/s, output: 21.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<46:23, 2.19s/it, est. speed input: 180.67 toks/s, output: 35.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<28:48, 1.37s/it, est. speed input: 252.97 toks/s, output: 56.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<18:57, 1.11it/s, est. speed input: 321.50 toks/s, output: 72.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:46, 1.42it/s, est. speed input: 382.20 toks/s, output: 86.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<11:31, 1.81it/s, est. speed input: 442.78 toks/s, output: 100.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<08:28, 2.45it/s, est. speed input: 509.64 toks/s, output: 120.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:19, 3.27it/s, est. speed input: 588.80 toks/s, output: 137.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<05:42, 3.59it/s, est. speed input: 685.31 toks/s, output: 163.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<04:47, 4.26it/s, est. speed input: 738.19 toks/s, output: 184.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:34<03:50, 5.30it/s, est. speed input: 797.98 toks/s, output: 204.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:21, 8.56it/s, est. speed input: 921.77 toks/s, output: 244.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:23, 14.24it/s, est. speed input: 1105.26 toks/s, output: 308.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<02:11, 9.02it/s, est. speed input: 1125.52 toks/s, output: 320.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<01:31, 12.91it/s, est. speed input: 1241.55 toks/s, output: 366.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<01:19, 14.79it/s, est. speed input: 1303.55 toks/s, output: 389.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<01:10, 16.59it/s, est. speed input: 1414.35 toks/s, output: 433.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:51, 22.57it/s, est. speed input: 1539.73 toks/s, output: 476.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:52, 21.71it/s, est. speed input: 1591.46 toks/s, output: 488.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:54, 21.03it/s, est. speed input: 1641.19 toks/s, output: 506.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<01:01, 18.42it/s, est. speed input: 1680.70 toks/s, output: 523.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:56, 20.15it/s, est. speed input: 1735.18 toks/s, output: 542.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:48, 23.41it/s, est. speed input: 1790.42 toks/s, output: 564.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:49, 22.71it/s, est. speed input: 1837.19 toks/s, output: 585.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:50, 22.32it/s, est. speed input: 1887.43 toks/s, output: 605.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:41, 26.54it/s, est. speed input: 1992.26 toks/s, output: 645.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:37, 29.34it/s, est. speed input: 2044.07 toks/s, output: 664.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:24, 44.33it/s, est. speed input: 2209.12 toks/s, output: 738.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:29, 37.17it/s, est. speed input: 2257.45 toks/s, output: 756.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:30, 35.38it/s, est. speed input: 2308.10 toks/s, output: 779.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:28, 37.20it/s, est. speed input: 2360.38 toks/s, output: 797.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:47, 22.67it/s, est. speed input: 2389.40 toks/s, output: 803.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:31, 33.39it/s, est. speed input: 2494.43 toks/s, output: 853.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:23, 44.28it/s, est. speed input: 2597.49 toks/s, output: 903.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:22, 45.81it/s, est. speed input: 2690.79 toks/s, output: 950.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:20, 51.00it/s, est. speed input: 2793.79 toks/s, output: 986.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:41<00:16, 60.46it/s, est. speed input: 2896.70 toks/s, output: 1036.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:19, 52.24it/s, est. speed input: 2989.76 toks/s, output: 1075.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:15, 64.91it/s, est. speed input: 3148.35 toks/s, output: 1148.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:20, 48.67it/s, est. speed input: 3237.34 toks/s, output: 1198.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:19, 50.73it/s, est. speed input: 3377.22 toks/s, output: 1279.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:14, 65.94it/s, est. speed input: 3575.40 toks/s, output: 1390.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:15, 60.36it/s, est. speed input: 3662.58 toks/s, output: 1444.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:42<00:16, 56.19it/s, est. speed input: 3747.61 toks/s, output: 1489.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:42<00:15, 60.97it/s, est. speed input: 3847.29 toks/s, output: 1531.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:43<00:13, 65.48it/s, est. speed input: 3943.43 toks/s, output: 1571.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:10, 88.68it/s, est. speed input: 4145.48 toks/s, output: 1651.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:09, 92.84it/s, est. speed input: 4338.08 toks/s, output: 1730.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:43<00:12, 69.37it/s, est. speed input: 4415.45 toks/s, output: 1768.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:43<00:12, 68.18it/s, est. speed input: 4500.04 toks/s, output: 1822.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:43<00:12, 67.34it/s, est. speed input: 4586.75 toks/s, output: 1856.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:44<00:13, 60.77it/s, est. speed input: 4669.72 toks/s, output: 1900.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:44<00:15, 54.22it/s, est. speed input: 4750.73 toks/s, output: 1951.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:12, 63.67it/s, est. speed input: 4891.06 toks/s, output: 2013.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:14, 54.77it/s, est. speed input: 4972.65 toks/s, output: 2061.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:44<00:11, 68.24it/s, est. speed input: 5114.15 toks/s, output: 2141.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:45<00:11, 67.11it/s, est. speed input: 5277.06 toks/s, output: 2227.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:09, 75.14it/s, est. speed input: 5457.08 toks/s, output: 2328.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:45<00:09, 77.15it/s, est. speed input: 5543.50 toks/s, output: 2379.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:45<00:10, 65.51it/s, est. speed input: 5620.39 toks/s, output: 2423.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:45<00:08, 83.09it/s, est. speed input: 5809.21 toks/s, output: 2537.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:09, 70.53it/s, est. speed input: 5886.19 toks/s, output: 2581.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:09, 70.14it/s, est. speed input: 5969.43 toks/s, output: 2624.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:46<00:07, 84.18it/s, est. speed input: 6156.52 toks/s, output: 2741.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:08, 79.77it/s, est. speed input: 6275.88 toks/s, output: 2830.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:46<00:06, 90.30it/s, est. speed input: 6404.64 toks/s, output: 2908.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:46<00:07, 86.38it/s, est. speed input: 6482.19 toks/s, output: 2958.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:47<00:04, 134.42it/s, est. speed input: 6810.69 toks/s, output: 3166.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:47<00:03, 141.29it/s, est. speed input: 6991.08 toks/s, output: 3273.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:47<00:04, 118.55it/s, est. speed input: 7110.62 toks/s, output: 3360.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:47<00:04, 129.88it/s, est. speed input: 7329.60 toks/s, output: 3524.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:47<00:02, 187.16it/s, est. speed input: 7695.70 toks/s, output: 3764.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:47<00:02, 185.51it/s, est. speed input: 7925.40 toks/s, output: 3909.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:48<00:03, 131.62it/s, est. speed input: 8105.12 toks/s, output: 4054.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:48<00:02, 169.28it/s, est. speed input: 8429.37 toks/s, output: 4273.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:48<00:02, 166.32it/s, est. speed input: 8638.25 toks/s, output: 4408.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:48<00:02, 126.88it/s, est. speed input: 8783.22 toks/s, output: 4507.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:48<00:03, 106.85it/s, est. speed input: 8929.07 toks/s, output: 4593.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:48<00:02, 135.65it/s, est. speed input: 9191.24 toks/s, output: 4781.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:49<00:01, 186.17it/s, est. speed input: 9595.73 toks/s, output: 5076.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:49<00:01, 162.93it/s, est. speed input: 9796.55 toks/s, output: 5220.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:49<00:01, 129.73it/s, est. speed input: 9985.57 toks/s, output: 5330.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:49<00:01, 123.33it/s, est. speed input: 10188.50 toks/s, output: 5507.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:49<00:01, 114.39it/s, est. speed input: 10288.34 toks/s, output: 5587.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:01, 115.35it/s, est. speed input: 10401.74 toks/s, output: 5685.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:50<00:01, 128.21it/s, est. speed input: 10610.63 toks/s, output: 5881.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:50<00:01, 116.69it/s, est. speed input: 10716.53 toks/s, output: 5980.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:50<00:01, 99.69it/s, est. speed input: 10842.14 toks/s, output: 6109.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:50<00:00, 94.17it/s, est. speed input: 10937.12 toks/s, output: 6204.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:51<00:01, 66.89it/s, est. speed input: 10963.43 toks/s, output: 6248.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:51<00:01, 53.70it/s, est. speed input: 10987.84 toks/s, output: 6297.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:51<00:01, 50.30it/s, est. speed input: 11025.79 toks/s, output: 6348.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:52<00:01, 36.29it/s, est. speed input: 11005.80 toks/s, output: 6369.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:52<00:01, 36.43it/s, est. speed input: 11025.20 toks/s, output: 6406.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:52<00:00, 37.17it/s, est. speed input: 11045.10 toks/s, output: 6446.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:00, 37.94it/s, est. speed input: 11064.45 toks/s, output: 6481.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 44.39it/s, est. speed input: 11124.91 toks/s, output: 6556.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:53<00:00, 34.53it/s, est. speed input: 11129.53 toks/s, output: 6592.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:53<00:00, 36.07it/s, est. speed input: 11149.19 toks/s, output: 6623.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 25.80it/s, est. speed input: 11107.98 toks/s, output: 6628.93 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 23.82it/s, est. speed input: 11107.98 toks/s, output: 6628.93 toks/s]
[36m(Runner pid=3309020)[0m ppo_kl: 1.3440168846301503e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.058
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.058
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.574
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.574
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 950008
[36m(Runner pid=3309020)[0m balanced_min: 950007
[36m(Runner pid=3309020)[0m max: 952972
[36m(Runner pid=3309020)[0m mean: 950007.5
[36m(Runner pid=3309020)[0m min: 947043
[36m(Runner pid=3309020)[0m minmax_diff: 5929
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.483
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 70.736
[36m(Runner pid=3309020)[0m mfu_actor: 0.116
[36m(Runner pid=3309020)[0m throughput: 1073.143
[36m(Runner pid=3309020)[0m time_per_step: 885.257
[36m(Runner pid=3309020)[0m total_num_tokens: 1900015
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 468.33
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2010.0
[36m(Runner pid=3309020)[0m mean: 273.863
[36m(Runner pid=3309020)[0m min: 49.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.213
[36m(Runner pid=3309020)[0m format: 0.913
[36m(Runner pid=3309020)[0m overall: 0.574
[36m(Runner pid=3309020)[0m tag_reward: 0.969
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.098498539239905e-05
[36m(Runner pid=3309020)[0m gen: 0.194
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.298
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.154
[36m(Runner pid=3309020)[0m gen: 135.899
[36m(Runner pid=3309020)[0m old: 87.603
[36m(Runner pid=3309020)[0m ref: 88.689
[36m(Runner pid=3309020)[0m reward: 6.642
[36m(Runner pid=3309020)[0m step: 885.257
[36m(Runner pid=3309020)[0m update_actor: 565.658
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 7; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.58 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:02:24 [executor_base.py:219] It took 0.338679 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.50 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:03:52 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:02:24 [executor_base.py:219] It took 0.337987 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:03:53 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:03:53 [executor_base.py:208] It took 0.327928 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.76 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.84 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:03:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:03:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:03:56 [executor_base.py:208] It took 0.326980 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.39937010407447815, 'actor/pg_clipfrac': 0.002742230426520109, 'actor/ppo_kl': -0.00012376060476526618}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2763735353946686, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.5423915982246399, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003596301539801061}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2024582177400589, 'actor/pg_clipfrac': 0.0018382353009656072, 'actor/ppo_kl': 0.0026247536297887564}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.20250454545021057, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00014655824634246528, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.06832382827997208, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0029702179599553347}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 9.632856381358579e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.18679824471473694, 'actor/pg_clipfrac': 0.001746216556057334, 'actor/ppo_kl': 0.0007283218437805772}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.8803308606147766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.4861737787723541, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.34150993824005127, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 8.950653864303604e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.32979071140289307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00010444092185935006, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00021597703744191676, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011428921716287732}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.18961890041828156, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006967073422856629}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.36964520812034607, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005337664624676108}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.1944696456193924, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006727154832333326}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1646435409784317, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0022105660755187273}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.10763544589281082, 'actor/pg_clipfrac': 0.0016246953746303916, 'actor/ppo_kl': 0.00047508941497653723}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4323216676712036, 'actor/pg_clipfrac': 0.0023952096235007048, 'actor/ppo_kl': -0.00045331165893003345}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.24781611561775208, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012080250307917595}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.36214521527290344, 'actor/pg_clipfrac': 0.0021528524812310934, 'actor/ppo_kl': 0.0020109470933675766}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3839026987552643, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006913396064192057}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.08910579979419708, 'actor/pg_clipfrac': 0.0021598271559923887, 'actor/ppo_kl': -0.00038365446380339563}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.21215349435806274, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004437925526872277}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.10074832290410995, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001451694843126461}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.18433156609535217, 'actor/pg_clipfrac': 0.0014727540547028184, 'actor/ppo_kl': 0.00017876127094496042}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.19666171073913574, 'actor/pg_clipfrac': 0.0030534351244568825, 'actor/ppo_kl': -0.0005269189132377505}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.4933091700077057, 'actor/pg_clipfrac': 0.001953125, 'actor/ppo_kl': 0.001010645180940628}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.9252896308898926, 'actor/pg_clipfrac': 0.0013166556600481272, 'actor/ppo_kl': 0.0006856020772829652}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3306176960468292, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006895926198922098}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.09099049121141434, 'actor/pg_clipfrac': 0.0006738544325344265, 'actor/ppo_kl': -9.425566531717777e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.14539991319179535, 'actor/pg_clipfrac': 0.0011337868636474013, 'actor/ppo_kl': 0.0014204729814082384}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.3816397190093994, 'actor/pg_clipfrac': 0.0006596306338906288, 'actor/ppo_kl': -0.0011352587025612593}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001257319818250835, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007337082643061876}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3420097827911377, 'actor/pg_clipfrac': 0.0007782101165503263, 'actor/ppo_kl': -0.000739375886041671}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1855221539735794, 'actor/pg_clipfrac': 0.0013755158288404346, 'actor/ppo_kl': -0.002858121180906892}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001080203874153085, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00044854357838630676}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.12279783934354782, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004674227093346417}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00015783093112986535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018417721148580313}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.22181016206741333, 'actor/pg_clipfrac': 0.0025466892402619123, 'actor/ppo_kl': 0.00045814740587957203}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4475213289260864, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.148291868157685e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00018609354447107762, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007824510685168207}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.15577949583530426, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00012289162259548903}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00014950674085412174, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00020013994071632624}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.32317212224006653, 'actor/pg_clipfrac': 0.0056134723126888275, 'actor/ppo_kl': 0.0011080456897616386}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00023315736325457692, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008606379269622266}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2889992892742157, 'actor/pg_clipfrac': 0.0011166945332661271, 'actor/ppo_kl': -0.00018468480266164988}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.036274127662181854, 'actor/pg_clipfrac': 0.0011025358689948916, 'actor/ppo_kl': -0.002063002670183778}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.16430774331092834, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': 0.0008161627338267863}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00016149328439496458, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011918296804651618}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3400568664073944, 'actor/pg_clipfrac': 0.002572898752987385, 'actor/ppo_kl': 0.00012224371312186122}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.09754572808742523, 'actor/pg_clipfrac': 0.002797202905640006, 'actor/ppo_kl': 0.0008525074808858335}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.296344518661499, 'actor/pg_clipfrac': 0.002775208093225956, 'actor/ppo_kl': -0.00010922173532890156}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0909242108464241, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014152751537039876}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.25222426652908325, 'actor/pg_clipfrac': 0.001336898421868682, 'actor/ppo_kl': 0.0005099403788335621}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.26748761534690857, 'actor/pg_clipfrac': 0.0013210040051490068, 'actor/ppo_kl': 0.00011264695058343932}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:24:25, 3.97s/it, est. speed input: 116.03 toks/s, output: 17.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:20<35:09, 1.66s/it, est. speed input: 230.62 toks/s, output: 38.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:22<24:25, 1.16s/it, est. speed input: 301.30 toks/s, output: 54.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:24<17:20, 1.21it/s, est. speed input: 375.69 toks/s, output: 69.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:24<11:41, 1.79it/s, est. speed input: 456.88 toks/s, output: 89.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:26<09:42, 2.15it/s, est. speed input: 520.36 toks/s, output: 103.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:26<06:47, 3.05it/s, est. speed input: 603.24 toks/s, output: 124.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<08:12, 2.52it/s, est. speed input: 636.86 toks/s, output: 134.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<06:30, 3.16it/s, est. speed input: 703.34 toks/s, output: 148.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:30<03:33, 5.73it/s, est. speed input: 850.05 toks/s, output: 190.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:30<03:06, 6.53it/s, est. speed input: 909.51 toks/s, output: 204.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:30<01:55, 10.50it/s, est. speed input: 1055.23 toks/s, output: 247.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:31<01:37, 12.25it/s, est. speed input: 1187.80 toks/s, output: 285.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:31<01:27, 13.62it/s, est. speed input: 1249.58 toks/s, output: 303.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:31<01:22, 14.37it/s, est. speed input: 1304.46 toks/s, output: 318.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:32<01:14, 15.94it/s, est. speed input: 1364.15 toks/s, output: 336.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:32<01:11, 16.44it/s, est. speed input: 1424.76 toks/s, output: 353.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:32<00:54, 21.47it/s, est. speed input: 1550.47 toks/s, output: 393.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:33<01:12, 16.15it/s, est. speed input: 1596.41 toks/s, output: 408.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<00:55, 20.91it/s, est. speed input: 1722.89 toks/s, output: 446.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:33<00:33, 33.56it/s, est. speed input: 1957.00 toks/s, output: 516.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:33<00:34, 32.58it/s, est. speed input: 2072.81 toks/s, output: 554.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:33<00:33, 33.68it/s, est. speed input: 2132.41 toks/s, output: 576.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:34<00:26, 42.25it/s, est. speed input: 2313.33 toks/s, output: 638.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:34<00:28, 38.86it/s, est. speed input: 2368.78 toks/s, output: 660.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:34<00:30, 36.15it/s, est. speed input: 2420.11 toks/s, output: 676.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:34<00:23, 45.64it/s, est. speed input: 2540.71 toks/s, output: 727.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:34<00:21, 49.18it/s, est. speed input: 2655.51 toks/s, output: 768.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:35<00:22, 46.78it/s, est. speed input: 2767.81 toks/s, output: 805.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:35<00:16, 63.62it/s, est. speed input: 2956.08 toks/s, output: 869.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:35<00:18, 56.37it/s, est. speed input: 3069.77 toks/s, output: 910.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:35<00:14, 72.05it/s, est. speed input: 3260.24 toks/s, output: 977.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:35<00:13, 75.92it/s, est. speed input: 3378.59 toks/s, output: 1018.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:35<00:18, 54.89it/s, est. speed input: 3470.81 toks/s, output: 1056.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:36<00:17, 57.32it/s, est. speed input: 3588.12 toks/s, output: 1101.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:36<00:25, 39.24it/s, est. speed input: 3672.50 toks/s, output: 1135.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:36<00:27, 35.54it/s, est. speed input: 3763.18 toks/s, output: 1182.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:37<00:29, 33.42it/s, est. speed input: 3803.33 toks/s, output: 1194.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:37<00:27, 35.73it/s, est. speed input: 3901.98 toks/s, output: 1247.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:37<00:28, 33.42it/s, est. speed input: 3946.73 toks/s, output: 1265.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:37<00:23, 41.24it/s, est. speed input: 4060.02 toks/s, output: 1309.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:37<00:23, 40.80it/s, est. speed input: 4108.03 toks/s, output: 1319.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:37<00:17, 54.40it/s, est. speed input: 4278.37 toks/s, output: 1390.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:38<00:15, 60.11it/s, est. speed input: 4384.43 toks/s, output: 1433.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:38<00:16, 56.11it/s, est. speed input: 4474.85 toks/s, output: 1473.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:38<00:10, 80.96it/s, est. speed input: 4699.81 toks/s, output: 1577.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:38<00:09, 87.96it/s, est. speed input: 4932.11 toks/s, output: 1658.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:38<00:08, 96.60it/s, est. speed input: 5091.82 toks/s, output: 1743.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:39<00:10, 79.21it/s, est. speed input: 5238.90 toks/s, output: 1807.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:39<00:11, 73.20it/s, est. speed input: 5331.10 toks/s, output: 1832.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:39<00:10, 77.74it/s, est. speed input: 5534.55 toks/s, output: 1935.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:39<00:11, 70.46it/s, est. speed input: 5771.04 toks/s, output: 2035.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:39<00:07, 96.18it/s, est. speed input: 6098.57 toks/s, output: 2191.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:40<00:06, 109.37it/s, est. speed input: 6305.52 toks/s, output: 2307.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:40<00:05, 130.88it/s, est. speed input: 6566.23 toks/s, output: 2428.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:40<00:05, 115.06it/s, est. speed input: 6752.22 toks/s, output: 2539.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:40<00:06, 105.02it/s, est. speed input: 6894.03 toks/s, output: 2618.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:40<00:06, 109.19it/s, est. speed input: 7046.35 toks/s, output: 2694.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:40<00:04, 132.73it/s, est. speed input: 7312.92 toks/s, output: 2822.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:40<00:03, 174.42it/s, est. speed input: 7682.80 toks/s, output: 3038.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:41<00:03, 153.83it/s, est. speed input: 7872.69 toks/s, output: 3140.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:41<00:04, 135.79it/s, est. speed input: 8055.63 toks/s, output: 3244.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:41<00:03, 159.22it/s, est. speed input: 8313.83 toks/s, output: 3421.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:41<00:02, 184.11it/s, est. speed input: 8634.72 toks/s, output: 3598.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:41<00:03, 122.62it/s, est. speed input: 8843.07 toks/s, output: 3732.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:42<00:04, 101.37it/s, est. speed input: 9017.42 toks/s, output: 3822.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:42<00:04, 107.59it/s, est. speed input: 9163.97 toks/s, output: 3901.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:42<00:04, 104.58it/s, est. speed input: 9294.68 toks/s, output: 3985.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:42<00:02, 139.47it/s, est. speed input: 9596.43 toks/s, output: 4166.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:42<00:02, 147.61it/s, est. speed input: 9836.35 toks/s, output: 4326.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:42<00:01, 175.77it/s, est. speed input: 10142.38 toks/s, output: 4500.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:42<00:01, 183.59it/s, est. speed input: 10383.44 toks/s, output: 4672.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:43<00:01, 174.57it/s, est. speed input: 10619.73 toks/s, output: 4800.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:43<00:01, 143.41it/s, est. speed input: 10774.46 toks/s, output: 4898.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:43<00:01, 171.40it/s, est. speed input: 11061.49 toks/s, output: 5108.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:43<00:01, 153.67it/s, est. speed input: 11235.56 toks/s, output: 5229.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:43<00:01, 164.32it/s, est. speed input: 11473.92 toks/s, output: 5415.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:43<00:01, 161.74it/s, est. speed input: 11654.73 toks/s, output: 5536.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:44<00:01, 144.93it/s, est. speed input: 11826.79 toks/s, output: 5667.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:44<00:01, 126.31it/s, est. speed input: 11985.69 toks/s, output: 5800.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:44<00:01, 96.04it/s, est. speed input: 12062.36 toks/s, output: 5872.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:44<00:01, 87.28it/s, est. speed input: 12163.68 toks/s, output: 5975.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:44<00:00, 101.20it/s, est. speed input: 12337.72 toks/s, output: 6093.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:45<00:01, 66.35it/s, est. speed input: 12368.25 toks/s, output: 6166.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:45<00:00, 76.69it/s, est. speed input: 12489.98 toks/s, output: 6275.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:45<00:00, 60.32it/s, est. speed input: 12541.44 toks/s, output: 6364.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:45<00:00, 63.66it/s, est. speed input: 12615.75 toks/s, output: 6423.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:46<00:00, 65.67it/s, est. speed input: 12673.36 toks/s, output: 6487.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:46<00:00, 69.71it/s, est. speed input: 12745.31 toks/s, output: 6570.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:46<00:00, 36.04it/s, est. speed input: 12664.02 toks/s, output: 6569.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:47<00:00, 27.17it/s, est. speed input: 12659.74 toks/s, output: 6599.60 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.11784827709197998, 'actor/pg_clipfrac': 0.005633802618831396, 'actor/ppo_kl': -0.0008553867810405791}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.39716529846191406, 'actor/pg_clipfrac': 0.0011820330983027816, 'actor/ppo_kl': 0.0025425453204661608}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.056508444249629974, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008339958149008453}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0014394018799066544, 'actor/pg_clipfrac': 0.0023809524718672037, 'actor/ppo_kl': -0.0001559590600663796}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.07516376674175262, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006207207916304469}
[36m(Runner pid=3309020)[0m Step 7
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.327
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.013
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.029
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.046
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.046
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.593
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.593
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 937785
[36m(Runner pid=3309020)[0m balanced_min: 937784
[36m(Runner pid=3309020)[0m max: 953102
[36m(Runner pid=3309020)[0m mean: 937784.5
[36m(Runner pid=3309020)[0m min: 922467
[36m(Runner pid=3309020)[0m minmax_diff: 30635
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.995
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 73.999
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1077.75
[36m(Runner pid=3309020)[0m time_per_step: 870.132
[36m(Runner pid=3309020)[0m total_num_tokens: 1875569
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 464.586
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1134.0
[36m(Runner pid=3309020)[0m mean: 268.058
[36m(Runner pid=3309020)[0m min: 43.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.22
[36m(Runner pid=3309020)[0m format: 0.955
[36m(Runner pid=3309020)[0m overall: 0.593
[36m(Runner pid=3309020)[0m tag_reward: 0.983
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.975987660524455e-05
[36m(Runner pid=3309020)[0m gen: 0.159
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.309
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.15
[36m(Runner pid=3309020)[0m gen: 109.444
[36m(Runner pid=3309020)[0m old: 86.841
[36m(Runner pid=3309020)[0m ref: 87.697
[36m(Runner pid=3309020)[0m reward: 6.133
[36m(Runner pid=3309020)[0m step: 870.132
[36m(Runner pid=3309020)[0m update_actor: 579.218
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 8; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:16:55 [executor_base.py:219] It took 0.339571 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:18:17 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:16:55 [executor_base.py:219] It took 0.337539 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:18:17 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:18:17 [executor_base.py:208] It took 0.325921 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:18:18 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:18:18 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:18:18 [executor_base.py:208] It took 0.328730 seconds to fall asleep.
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001438392064301297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010997100034728646}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2496393620967865, 'actor/pg_clipfrac': 0.001123595517128706, 'actor/ppo_kl': -0.0010379544692113996}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.193949356675148, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015434481902047992}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.051588669419288635, 'actor/pg_clipfrac': 0.004166666883975267, 'actor/ppo_kl': -0.0019204934360459447}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002057441306533292, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00024464537273161113}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2964847981929779, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.23309001326560974, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00014149770140647888, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3014861047267914, 'actor/pg_clipfrac': 0.0009328357991762459, 'actor/ppo_kl': 0.0003370242193341255}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2831544876098633, 'actor/pg_clipfrac': 0.001661129528656602, 'actor/ppo_kl': -6.619957275688648e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00012116388097638264, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001908628037199378}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0001594203495187685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019331852672621608}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.6074821352958679, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': -0.0009437325643375516}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00013313902309164405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0020137163810431957}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.06368762999773026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.23152826726436615, 'actor/pg_clipfrac': 0.001673173508606851, 'actor/ppo_kl': -3.189197514075204e-06}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.11708042025566101, 'actor/pg_clipfrac': 0.0009442870505154133, 'actor/ppo_kl': 0.00027657815371640027}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3829173445701599, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.5844672563835047e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.26480117440223694, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011214743135496974}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0001389867247780785, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004236424283590168}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3008841574192047, 'actor/pg_clipfrac': 0.004552352242171764, 'actor/ppo_kl': 0.0033597180154174566}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 9.759395470609888e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004619633255060762}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.21046197414398193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.101575127104297e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4147185981273651, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00020212557865306735}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.000142324497574009, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00028333664522506297}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00012369461182970554, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00015946966595947742}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.34277668595314026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0019785058684647083}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00015212454309221357, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005295941373333335}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5503167510032654, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008455942734144628}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00026378920301795006, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007084052776917815}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00011285584332654253, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006131650297902524}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001067951088771224, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006055798148736358}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.17052273452281952, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0023179291747510433}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.21566374599933624, 'actor/pg_clipfrac': 0.002051281975582242, 'actor/ppo_kl': -0.0004554787592496723}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.28346118330955505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00141046941280365}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.13158589601516724, 'actor/pg_clipfrac': 0.0007446016534231603, 'actor/ppo_kl': -0.001122692716307938}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.19691675901412964, 'actor/pg_clipfrac': 0.004297994077205658, 'actor/ppo_kl': 0.0025716114323586226}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2855181396007538, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016009309329092503}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00018611212726682425, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002403927966952324}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4711097180843353, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011244146153330803}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.007492370437830687, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006406169850379229}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.19557678699493408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009330659522674978}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2629200220108032, 'actor/pg_clipfrac': 0.0011467889416962862, 'actor/ppo_kl': 0.0006855660467408597}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2609308063983917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006336886435747147}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 9.878520359052345e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007760970038361847}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001070084108505398, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010124959517270327}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.20481932163238525, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002502746065147221}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.43944987654685974, 'actor/pg_clipfrac': 0.0017301038606092334, 'actor/ppo_kl': 0.001684121205471456}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00012154621799709275, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004169447929598391}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.26295238733291626, 'actor/pg_clipfrac': 0.0010020040208473802, 'actor/ppo_kl': -0.0004970114678144455}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.16647008061408997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006479710573330522}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:23:26, 3.93s/it, est. speed input: 110.52 toks/s, output: 20.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:20<36:11, 1.71s/it, est. speed input: 220.58 toks/s, output: 42.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<30:52, 1.46s/it, est. speed input: 254.13 toks/s, output: 51.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<18:57, 1.11it/s, est. speed input: 339.27 toks/s, output: 72.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<13:36, 1.54it/s, est. speed input: 408.40 toks/s, output: 89.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:27<09:20, 2.23it/s, est. speed input: 485.28 toks/s, output: 107.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<07:22, 2.82it/s, est. speed input: 559.82 toks/s, output: 126.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:28<05:13, 3.96it/s, est. speed input: 634.14 toks/s, output: 145.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:28<03:46, 5.45it/s, est. speed input: 705.40 toks/s, output: 164.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:29<02:12, 9.28it/s, est. speed input: 863.39 toks/s, output: 207.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:29<01:36, 12.54it/s, est. speed input: 1010.85 toks/s, output: 248.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:29<01:07, 17.81it/s, est. speed input: 1166.16 toks/s, output: 288.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:30<01:15, 15.87it/s, est. speed input: 1224.36 toks/s, output: 305.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:30<01:14, 16.13it/s, est. speed input: 1292.27 toks/s, output: 324.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:31<01:33, 12.72it/s, est. speed input: 1336.45 toks/s, output: 339.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:31<01:04, 18.10it/s, est. speed input: 1612.36 toks/s, output: 414.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:31<00:58, 19.83it/s, est. speed input: 1678.47 toks/s, output: 437.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:32<00:44, 26.18it/s, est. speed input: 1808.45 toks/s, output: 478.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:32<00:39, 29.01it/s, est. speed input: 1941.96 toks/s, output: 521.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:32<00:23, 48.04it/s, est. speed input: 2343.22 toks/s, output: 656.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:32<00:21, 51.26it/s, est. speed input: 2535.74 toks/s, output: 719.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:33<00:18, 57.91it/s, est. speed input: 2743.13 toks/s, output: 776.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:33<00:20, 53.63it/s, est. speed input: 2860.25 toks/s, output: 821.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:33<00:19, 54.39it/s, est. speed input: 2985.68 toks/s, output: 867.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:33<00:15, 67.70it/s, est. speed input: 3174.76 toks/s, output: 931.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:34<00:27, 37.97it/s, est. speed input: 3249.38 toks/s, output: 961.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:34<00:26, 39.55it/s, est. speed input: 3363.04 toks/s, output: 1010.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:34<00:21, 46.99it/s, est. speed input: 3547.55 toks/s, output: 1074.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:34<00:19, 50.19it/s, est. speed input: 3728.57 toks/s, output: 1142.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:35<00:21, 47.09it/s, est. speed input: 3837.73 toks/s, output: 1184.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:35<00:17, 54.45it/s, est. speed input: 3957.94 toks/s, output: 1232.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:35<00:16, 57.29it/s, est. speed input: 4072.41 toks/s, output: 1275.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:35<00:15, 63.11it/s, est. speed input: 4249.63 toks/s, output: 1345.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:35<00:12, 72.74it/s, est. speed input: 4424.77 toks/s, output: 1426.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:36<00:19, 48.50it/s, est. speed input: 4488.30 toks/s, output: 1464.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:36<00:17, 51.59it/s, est. speed input: 4660.58 toks/s, output: 1513.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:36<00:14, 60.06it/s, est. speed input: 4828.55 toks/s, output: 1572.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:36<00:16, 53.66it/s, est. speed input: 4918.77 toks/s, output: 1620.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:37<00:16, 52.03it/s, est. speed input: 5109.72 toks/s, output: 1708.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:37<00:13, 60.93it/s, est. speed input: 5372.23 toks/s, output: 1813.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:37<00:14, 55.41it/s, est. speed input: 5509.06 toks/s, output: 1873.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:38<00:12, 66.76it/s, est. speed input: 5676.18 toks/s, output: 1955.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:38<00:11, 68.17it/s, est. speed input: 5777.34 toks/s, output: 1987.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:38<00:10, 73.32it/s, est. speed input: 5879.11 toks/s, output: 2030.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:38<00:08, 91.74it/s, est. speed input: 6102.62 toks/s, output: 2137.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:38<00:08, 92.25it/s, est. speed input: 6262.47 toks/s, output: 2210.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:38<00:06, 109.21it/s, est. speed input: 6482.50 toks/s, output: 2297.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:38<00:05, 123.51it/s, est. speed input: 6695.95 toks/s, output: 2412.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:38<00:05, 123.89it/s, est. speed input: 6852.79 toks/s, output: 2485.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:39<00:05, 116.68it/s, est. speed input: 7001.45 toks/s, output: 2558.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:39<00:08, 82.56it/s, est. speed input: 7122.32 toks/s, output: 2620.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:39<00:06, 101.63it/s, est. speed input: 7340.03 toks/s, output: 2716.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:39<00:05, 110.21it/s, est. speed input: 7501.35 toks/s, output: 2801.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:39<00:04, 135.03it/s, est. speed input: 7766.45 toks/s, output: 2952.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:39<00:04, 137.25it/s, est. speed input: 7978.54 toks/s, output: 3075.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:40<00:05, 104.12it/s, est. speed input: 8103.83 toks/s, output: 3164.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:40<00:04, 135.28it/s, est. speed input: 8417.14 toks/s, output: 3311.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:40<00:04, 129.57it/s, est. speed input: 8619.02 toks/s, output: 3422.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:40<00:02, 185.33it/s, est. speed input: 9052.39 toks/s, output: 3658.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:40<00:03, 130.59it/s, est. speed input: 9261.53 toks/s, output: 3782.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:41<00:03, 133.70it/s, est. speed input: 9460.55 toks/s, output: 3897.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:41<00:03, 137.78it/s, est. speed input: 9660.38 toks/s, output: 4033.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:41<00:02, 141.21it/s, est. speed input: 9867.39 toks/s, output: 4122.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:41<00:02, 144.74it/s, est. speed input: 10062.92 toks/s, output: 4234.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:41<00:02, 148.53it/s, est. speed input: 10247.14 toks/s, output: 4346.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:41<00:02, 139.38it/s, est. speed input: 10419.31 toks/s, output: 4440.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:41<00:02, 155.69it/s, est. speed input: 10664.44 toks/s, output: 4602.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:41<00:01, 168.47it/s, est. speed input: 10908.93 toks/s, output: 4741.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:42<00:02, 115.20it/s, est. speed input: 11046.23 toks/s, output: 4825.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:42<00:02, 100.93it/s, est. speed input: 11177.10 toks/s, output: 4907.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:42<00:02, 94.27it/s, est. speed input: 11289.72 toks/s, output: 4997.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:42<00:01, 121.04it/s, est. speed input: 11579.74 toks/s, output: 5205.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:42<00:01, 115.50it/s, est. speed input: 11707.37 toks/s, output: 5301.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:43<00:01, 101.57it/s, est. speed input: 11814.87 toks/s, output: 5390.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:43<00:01, 101.59it/s, est. speed input: 11984.52 toks/s, output: 5518.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:43<00:01, 86.32it/s, est. speed input: 12072.62 toks/s, output: 5595.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:43<00:01, 95.95it/s, est. speed input: 12223.21 toks/s, output: 5697.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:44<00:01, 75.94it/s, est. speed input: 12295.57 toks/s, output: 5771.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:44<00:01, 60.04it/s, est. speed input: 12320.56 toks/s, output: 5817.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:44<00:01, 65.95it/s, est. speed input: 12395.17 toks/s, output: 5885.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:44<00:01, 76.08it/s, est. speed input: 12516.43 toks/s, output: 6014.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:44<00:00, 90.24it/s, est. speed input: 12687.74 toks/s, output: 6187.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:44<00:00, 125.65it/s, est. speed input: 12980.30 toks/s, output: 6435.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:45<00:00, 83.49it/s, est. speed input: 13027.63 toks/s, output: 6533.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:45<00:00, 47.12it/s, est. speed input: 12988.52 toks/s, output: 6558.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:47<00:00, 27.22it/s, est. speed input: 12742.64 toks/s, output: 6447.45 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00024435625527985394, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010741640289779752}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.12519042193889618, 'actor/pg_clipfrac': 0.0025445292703807354, 'actor/ppo_kl': -6.289639713941142e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.008720902726054192, 'actor/pg_clipfrac': 0.0018115942366421223, 'actor/ppo_kl': 0.0006111290422268212}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.33300864696502686, 'actor/pg_clipfrac': 0.0011876485077664256, 'actor/ppo_kl': 0.0013655778020620346}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.024001911282539368, 'actor/pg_clipfrac': 0.0026881720405071974, 'actor/ppo_kl': 0.00035733048571273685}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.2106098234653473, 'actor/pg_clipfrac': 0.002366863889619708, 'actor/ppo_kl': -0.001295175519771874}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.11070843786001205, 'actor/pg_clipfrac': 0.0008818341884762049, 'actor/ppo_kl': -0.00038507921271957457}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.04314517229795456, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008525848388671875}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.4315118193626404, 'actor/pg_clipfrac': 0.0016313213855028152, 'actor/ppo_kl': 0.0009878339478746057}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.45743122696876526, 'actor/pg_clipfrac': 0.0010493178851902485, 'actor/ppo_kl': -0.0005093261133879423}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.904799222946167, 'actor/pg_clipfrac': 0.001212121220305562, 'actor/ppo_kl': -0.0013116431655362248}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04750103875994682, 'actor/pg_clipfrac': 0.00374531839042902, 'actor/ppo_kl': -0.0012025035684928298}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.15323993563652039, 'actor/pg_clipfrac': 0.0006119951140135527, 'actor/ppo_kl': -0.0007210664916783571}
[36m(Runner pid=3309020)[0m Step 8
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.328
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.014
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.015
[36m(Runner pid=3309020)[0m ppo_kl: -2.9840297358418865e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.026
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.026
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.601
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.601
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 910295
[36m(Runner pid=3309020)[0m balanced_min: 910294
[36m(Runner pid=3309020)[0m max: 913282
[36m(Runner pid=3309020)[0m mean: 910294.5
[36m(Runner pid=3309020)[0m min: 907307
[36m(Runner pid=3309020)[0m minmax_diff: 5975
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.638
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 73.999
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1077.746
[36m(Runner pid=3309020)[0m time_per_step: 844.628
[36m(Runner pid=3309020)[0m total_num_tokens: 1820589
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 671.0
[36m(Runner pid=3309020)[0m mean: 464.26
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1302.0
[36m(Runner pid=3309020)[0m mean: 246.908
[36m(Runner pid=3309020)[0m min: 50.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.223
[36m(Runner pid=3309020)[0m format: 0.972
[36m(Runner pid=3309020)[0m overall: 0.601
[36m(Runner pid=3309020)[0m tag_reward: 0.99
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.362909201248956e-05
[36m(Runner pid=3309020)[0m gen: 0.159
[36m(Runner pid=3309020)[0m old: 0.047
[36m(Runner pid=3309020)[0m ref: 0.049
[36m(Runner pid=3309020)[0m reward: 0.011
[36m(Runner pid=3309020)[0m update_actor: 0.309
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.134
[36m(Runner pid=3309020)[0m gen: 100.628
[36m(Runner pid=3309020)[0m old: 86.172
[36m(Runner pid=3309020)[0m ref: 88.315
[36m(Runner pid=3309020)[0m reward: 6.993
[36m(Runner pid=3309020)[0m step: 844.628
[36m(Runner pid=3309020)[0m update_actor: 561.805
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 9; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.67 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.04 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:31:02 [executor_base.py:219] It took 0.339256 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.96 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.59 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:32:23 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:31:01 [executor_base.py:219] It took 0.338290 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:32:23 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.87 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:32:23 [executor_base.py:208] It took 0.326503 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.87 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:32:39 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:32:39 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:32:39 [executor_base.py:208] It took 0.328445 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3512565493583679, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016474552103318274}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00020288460655137897, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.28148171305656433, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0844496414065361, 'actor/pg_clipfrac': 0.002288329415023327, 'actor/ppo_kl': -0.0013352181995287538}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.2596668601036072, 'actor/pg_clipfrac': 0.0032362460624426603, 'actor/ppo_kl': -0.0024032345972955227}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001444337540306151, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001234455849044025}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.371044397354126, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00016001479525584728, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005436456413008273}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.03830854967236519, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00012464857718441635}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00010838780144695193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00014254223788157105, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.15604394674301147, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009460995206609368}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00016745293396525085, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014268531231209636}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00015858668484725058, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.1665230244398117, 'actor/pg_clipfrac': 0.0006578947650268674, 'actor/ppo_kl': -0.0007053136941976845}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.28246980905532837, 'actor/pg_clipfrac': 0.002016128972172737, 'actor/ppo_kl': -0.0012150657130405307}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.13624556362628937, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00040328572504222393}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.18811005353927612, 'actor/pg_clipfrac': 0.0013227512827143073, 'actor/ppo_kl': -0.0009582812199369073}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00022501515923067927, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001332081388682127}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.21544019877910614, 'actor/pg_clipfrac': 0.0011223345063626766, 'actor/ppo_kl': 0.0007687321631237864}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00014090632612351328, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.205647802446038e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.9244019985198975, 'actor/pg_clipfrac': 0.0013633265625685453, 'actor/ppo_kl': 0.0008800028008408844}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.29549428820610046, 'actor/pg_clipfrac': 0.0015873016091063619, 'actor/ppo_kl': -8.048708696151152e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1128939539194107, 'actor/pg_clipfrac': 0.0008460236713290215, 'actor/ppo_kl': -0.0004888250259682536}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00015181783237494528, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00021945094340480864}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.47284194827079773, 'actor/pg_clipfrac': 0.0030549897346645594, 'actor/ppo_kl': -0.0003135374281555414}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.4482961595058441, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013001036131754518}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.31306493282318115, 'actor/pg_clipfrac': 0.0015625000232830644, 'actor/ppo_kl': -0.00037150978459976614}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.27163565158843994, 'actor/pg_clipfrac': 0.0019379844889044762, 'actor/ppo_kl': -0.0020254047121852636}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.44773349165916443, 'actor/pg_clipfrac': 0.0006591957644559443, 'actor/ppo_kl': -0.000391448411392048}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.03212669864296913, 'actor/pg_clipfrac': 0.002515723230317235, 'actor/ppo_kl': 8.047332084970549e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2689509689807892, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005750720156356692}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.14443925023078918, 'actor/pg_clipfrac': 0.0025575447361916304, 'actor/ppo_kl': 0.001298660528846085}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.14612030982971191, 'actor/pg_clipfrac': 0.002239641733467579, 'actor/ppo_kl': 0.0007882369100116193}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001672846992732957, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008131384965963662}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.15723742544651031, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006535376305691898}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.40527257323265076, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003870046348311007}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.06445655971765518, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00023108534514904022}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.15543143451213837, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000743248441722244}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.442179799079895, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000312544172629714}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.9229850769042969, 'actor/pg_clipfrac': 0.0012484394246712327, 'actor/ppo_kl': -0.001639703055843711}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.17277581989765167, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006606476381421089}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00013444392243400216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001405335497111082}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.6590705513954163, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041767378570511937}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1063317060470581, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019051630806643516}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:32:29, 4.35s/it, est. speed input: 108.89 toks/s, output: 22.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<47:13, 2.23s/it, est. speed input: 181.40 toks/s, output: 38.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<27:06, 1.29s/it, est. speed input: 259.53 toks/s, output: 55.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<18:08, 1.16it/s, est. speed input: 333.06 toks/s, output: 72.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<12:33, 1.67it/s, est. speed input: 406.03 toks/s, output: 93.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<09:08, 2.28it/s, est. speed input: 478.31 toks/s, output: 110.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<06:20, 3.28it/s, est. speed input: 557.10 toks/s, output: 129.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:28<04:30, 4.59it/s, est. speed input: 631.83 toks/s, output: 147.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<04:08, 4.96it/s, est. speed input: 694.98 toks/s, output: 164.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:29<01:33, 12.98it/s, est. speed input: 999.67 toks/s, output: 247.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:30<01:22, 14.57it/s, est. speed input: 1125.35 toks/s, output: 281.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:30<01:16, 15.69it/s, est. speed input: 1192.47 toks/s, output: 305.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:31<01:03, 18.61it/s, est. speed input: 1325.71 toks/s, output: 347.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:31<00:50, 23.40it/s, est. speed input: 1461.87 toks/s, output: 388.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:31<00:39, 29.97it/s, est. speed input: 1603.12 toks/s, output: 432.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:31<00:27, 42.63it/s, est. speed input: 1813.79 toks/s, output: 487.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:31<00:25, 44.52it/s, est. speed input: 1953.43 toks/s, output: 526.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:32<00:42, 26.71it/s, est. speed input: 2047.91 toks/s, output: 559.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:32<00:37, 29.86it/s, est. speed input: 2175.63 toks/s, output: 596.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:32<00:34, 32.70it/s, est. speed input: 2296.04 toks/s, output: 634.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:33<00:29, 37.46it/s, est. speed input: 2428.00 toks/s, output: 682.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:33<00:28, 38.36it/s, est. speed input: 2482.82 toks/s, output: 703.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:33<00:33, 33.16it/s, est. speed input: 2537.60 toks/s, output: 725.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:33<00:31, 35.16it/s, est. speed input: 2595.15 toks/s, output: 745.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:33<00:29, 37.00it/s, est. speed input: 2649.75 toks/s, output: 768.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:33<00:22, 48.69it/s, est. speed input: 2772.54 toks/s, output: 811.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:33<00:18, 58.65it/s, est. speed input: 2896.18 toks/s, output: 863.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:34<00:17, 59.36it/s, est. speed input: 3009.67 toks/s, output: 914.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:34<00:17, 60.64it/s, est. speed input: 3135.42 toks/s, output: 962.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:34<00:16, 62.05it/s, est. speed input: 3251.12 toks/s, output: 1005.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:34<00:22, 45.05it/s, est. speed input: 3344.07 toks/s, output: 1042.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:34<00:23, 43.40it/s, est. speed input: 3454.96 toks/s, output: 1092.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:35<00:20, 48.80it/s, est. speed input: 3564.95 toks/s, output: 1134.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:35<00:21, 46.20it/s, est. speed input: 3666.64 toks/s, output: 1172.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:35<00:20, 47.96it/s, est. speed input: 3772.93 toks/s, output: 1219.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:35<00:15, 61.58it/s, est. speed input: 3943.30 toks/s, output: 1283.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:35<00:12, 74.31it/s, est. speed input: 4127.96 toks/s, output: 1348.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:35<00:12, 75.49it/s, est. speed input: 4243.15 toks/s, output: 1393.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:36<00:09, 96.64it/s, est. speed input: 4476.10 toks/s, output: 1484.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:36<00:09, 94.58it/s, est. speed input: 4644.54 toks/s, output: 1557.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:36<00:10, 86.86it/s, est. speed input: 4801.18 toks/s, output: 1616.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:36<00:10, 85.81it/s, est. speed input: 4909.87 toks/s, output: 1668.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:36<00:10, 85.33it/s, est. speed input: 5022.09 toks/s, output: 1710.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:37<00:16, 52.98it/s, est. speed input: 5089.11 toks/s, output: 1743.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:37<00:15, 55.97it/s, est. speed input: 5197.26 toks/s, output: 1786.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:37<00:16, 51.70it/s, est. speed input: 5289.28 toks/s, output: 1816.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:37<00:10, 76.44it/s, est. speed input: 5512.43 toks/s, output: 1904.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:37<00:08, 94.09it/s, est. speed input: 5739.88 toks/s, output: 2012.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:37<00:08, 93.20it/s, est. speed input: 5891.41 toks/s, output: 2072.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:38<00:08, 93.12it/s, est. speed input: 6048.71 toks/s, output: 2163.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:38<00:07, 105.88it/s, est. speed input: 6324.51 toks/s, output: 2280.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:38<00:07, 101.62it/s, est. speed input: 6527.29 toks/s, output: 2376.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:38<00:09, 74.38it/s, est. speed input: 6639.91 toks/s, output: 2421.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:38<00:07, 91.22it/s, est. speed input: 6858.90 toks/s, output: 2494.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:39<00:06, 107.56it/s, est. speed input: 7100.86 toks/s, output: 2605.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:39<00:05, 113.09it/s, est. speed input: 7252.72 toks/s, output: 2670.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:39<00:05, 118.91it/s, est. speed input: 7405.80 toks/s, output: 2747.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:39<00:04, 131.27it/s, est. speed input: 7631.05 toks/s, output: 2853.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:39<00:04, 141.80it/s, est. speed input: 7848.29 toks/s, output: 2964.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:39<00:04, 123.57it/s, est. speed input: 8031.25 toks/s, output: 3065.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:39<00:04, 131.91it/s, est. speed input: 8237.00 toks/s, output: 3174.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:39<00:03, 144.79it/s, est. speed input: 8447.40 toks/s, output: 3279.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:40<00:03, 142.58it/s, est. speed input: 8651.11 toks/s, output: 3376.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:40<00:04, 125.95it/s, est. speed input: 8789.32 toks/s, output: 3437.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:40<00:03, 125.84it/s, est. speed input: 8928.28 toks/s, output: 3526.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:40<00:03, 147.44it/s, est. speed input: 9247.42 toks/s, output: 3691.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:40<00:04, 110.89it/s, est. speed input: 9360.27 toks/s, output: 3755.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:40<00:03, 116.02it/s, est. speed input: 9508.69 toks/s, output: 3849.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:40<00:02, 150.70it/s, est. speed input: 9849.58 toks/s, output: 4042.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:41<00:02, 144.70it/s, est. speed input: 10027.19 toks/s, output: 4151.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:41<00:03, 102.92it/s, est. speed input: 10172.68 toks/s, output: 4253.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:41<00:02, 128.79it/s, est. speed input: 10483.21 toks/s, output: 4444.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:41<00:01, 159.21it/s, est. speed input: 10922.03 toks/s, output: 4685.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:41<00:01, 188.00it/s, est. speed input: 11290.03 toks/s, output: 4911.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:42<00:01, 205.31it/s, est. speed input: 11626.49 toks/s, output: 5122.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:42<00:01, 167.14it/s, est. speed input: 11843.17 toks/s, output: 5274.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:42<00:01, 152.66it/s, est. speed input: 12030.59 toks/s, output: 5413.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:42<00:01, 121.29it/s, est. speed input: 12171.56 toks/s, output: 5526.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:42<00:01, 110.34it/s, est. speed input: 12324.84 toks/s, output: 5620.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:42<00:01, 116.11it/s, est. speed input: 12454.06 toks/s, output: 5738.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:43<00:00, 116.08it/s, est. speed input: 12665.70 toks/s, output: 5912.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:43<00:00, 122.31it/s, est. speed input: 12796.84 toks/s, output: 6013.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:43<00:00, 121.67it/s, est. speed input: 12935.66 toks/s, output: 6109.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:44<00:01, 48.11it/s, est. speed input: 12850.41 toks/s, output: 6111.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:44<00:00, 53.81it/s, est. speed input: 12987.28 toks/s, output: 6245.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:44<00:00, 59.88it/s, est. speed input: 13093.50 toks/s, output: 6348.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:44<00:00, 56.87it/s, est. speed input: 13133.11 toks/s, output: 6403.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:45<00:00, 30.57it/s, est. speed input: 12991.92 toks/s, output: 6393.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:46<00:00, 27.44it/s, est. speed input: 12799.36 toks/s, output: 6331.70 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.02900063246488571, 'actor/pg_clipfrac': 0.002400000113993883, 'actor/ppo_kl': -0.0006085784989409149}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00010605602437863126, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.680063233943656e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.11574314534664154, 'actor/pg_clipfrac': 0.0009115770226344466, 'actor/ppo_kl': 0.001979890512302518}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3834913671016693, 'actor/pg_clipfrac': 0.002006018068641424, 'actor/ppo_kl': 7.98867258708924e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00021295319311320782, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019262246787548065}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00017238488362636417, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015821946726646274}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00021491380175575614, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007651696796528995}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00018241218640469015, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000354936346411705}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 7.998994260560721e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006154181901365519}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00013402168406173587, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018127828370779753}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.7291628122329712, 'actor/pg_clipfrac': 0.0007369196973741055, 'actor/ppo_kl': 0.00031917940941639245}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 1.253828763961792, 'actor/pg_clipfrac': 0.0005853087641298771, 'actor/ppo_kl': 0.0004374906129669398}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00015956723655108362, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005228408263064921}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.06434108316898346, 'actor/pg_clipfrac': 0.0013351135421544313, 'actor/ppo_kl': 0.001173179829493165}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00012310068996157497, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012070820666849613}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00011590590293053538, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006789279286749661}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 9.970374958356842e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00026385197998024523}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.600733757019043, 'actor/pg_clipfrac': 0.0013157895300537348, 'actor/ppo_kl': 0.000698350602760911}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00019594680634327233, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008756234892643988}
[36m(Runner pid=3309020)[0m Step 9
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.31
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.017
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.013
[36m(Runner pid=3309020)[0m ppo_kl: -8.530207742722951e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.615
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.615
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 908570
[36m(Runner pid=3309020)[0m balanced_min: 907222
[36m(Runner pid=3309020)[0m max: 913231
[36m(Runner pid=3309020)[0m mean: 907896.0
[36m(Runner pid=3309020)[0m min: 902561
[36m(Runner pid=3309020)[0m minmax_diff: 10670
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.729
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 73.999
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1061.971
[36m(Runner pid=3309020)[0m time_per_step: 854.916
[36m(Runner pid=3309020)[0m total_num_tokens: 1815792
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 634.0
[36m(Runner pid=3309020)[0m mean: 467.131
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3180.0
[36m(Runner pid=3309020)[0m mean: 242.163
[36m(Runner pid=3309020)[0m min: 38.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.242
[36m(Runner pid=3309020)[0m format: 0.983
[36m(Runner pid=3309020)[0m overall: 0.615
[36m(Runner pid=3309020)[0m tag_reward: 0.994
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.413330709800464e-05
[36m(Runner pid=3309020)[0m gen: 0.183
[36m(Runner pid=3309020)[0m old: 0.047
[36m(Runner pid=3309020)[0m ref: 0.048
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.309
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.135
[36m(Runner pid=3309020)[0m gen: 113.324
[36m(Runner pid=3309020)[0m old: 85.761
[36m(Runner pid=3309020)[0m ref: 88.0
[36m(Runner pid=3309020)[0m reward: 6.497
[36m(Runner pid=3309020)[0m step: 854.916
[36m(Runner pid=3309020)[0m update_actor: 560.667
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 10; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.57 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:45:19 [executor_base.py:219] It took 0.342891 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.49 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:46:38 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:45:19 [executor_base.py:219] It took 0.340096 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:46:38 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.86 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:46:38 [executor_base.py:208] It took 0.326062 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.86 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:46:40 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:46:41 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:46:41 [executor_base.py:208] It took 0.328708 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.35982847213745117, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1308887004852295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.003535911440849304}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2902756929397583, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004240732523612678}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0813266783952713, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.41771888732910156, 'actor/pg_clipfrac': 0.0014347202377393842, 'actor/ppo_kl': 0.00085798668442294}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.03384464606642723, 'actor/pg_clipfrac': 0.0018416206585243344, 'actor/ppo_kl': -0.0005038983072154224}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.025489158928394318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2562349736690521, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.08503209054470062, 'actor/pg_clipfrac': 0.0013140604132786393, 'actor/ppo_kl': 0.0022581203375011683}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2811269760131836, 'actor/pg_clipfrac': 0.001782531151548028, 'actor/ppo_kl': 0.002613499527797103}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.04305647686123848, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.16047456860542297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0253316517919302, 'actor/pg_clipfrac': 0.0008680555620230734, 'actor/ppo_kl': -0.00024047990154940635}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.15876564383506775, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001519685611128807}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.4311876595020294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1863221526145935, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002051971823675558, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013264442095533013}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.10986167192459106, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008276806329376996}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4189230799674988, 'actor/pg_clipfrac': 0.0017241379246115685, 'actor/ppo_kl': 0.002109859837219119}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.07411658763885498, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00011023798288078979}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.42376941442489624, 'actor/pg_clipfrac': 0.003976142965257168, 'actor/ppo_kl': -0.004607710521668196}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5139050483703613, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006842004950158298}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2568436861038208, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002498915186151862}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.17526055872440338, 'actor/pg_clipfrac': 0.004950494971126318, 'actor/ppo_kl': -0.0007417320157401264}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.29148685932159424, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008926753653213382}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3736709952354431, 'actor/pg_clipfrac': 0.001287001301534474, 'actor/ppo_kl': 0.001270392327569425}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3183039128780365, 'actor/pg_clipfrac': 0.0007530120201408863, 'actor/ppo_kl': -0.001061303075402975}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.28614455461502075, 'actor/pg_clipfrac': 0.0035118525847792625, 'actor/ppo_kl': 0.00017297927115578204}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.009640080854296684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014287381200119853}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.06631740182638168, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006259033107198775}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.13750623166561127, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002862798748537898}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.306072473526001, 'actor/pg_clipfrac': 0.001349527621641755, 'actor/ppo_kl': -4.295009421184659e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.6540120244026184, 'actor/pg_clipfrac': 0.002559727057814598, 'actor/ppo_kl': 0.0015155316796153784}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0001203210194944404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006161030032671988}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.03715948387980461, 'actor/pg_clipfrac': 0.001917545567266643, 'actor/ppo_kl': 0.0016830660169944167}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.3148311376571655, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007578711956739426}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3884356617927551, 'actor/pg_clipfrac': 0.00325203244574368, 'actor/ppo_kl': 0.0027581099420785904}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.06747099757194519, 'actor/pg_clipfrac': 0.0010183299891650677, 'actor/ppo_kl': -0.00020947815210092813}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00016306521138176322, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00044841927592642605}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:34:40, 15.11s/it, est. speed input: 29.39 toks/s, output: 4.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<39:15, 6.28s/it, est. speed input: 61.08 toks/s, output: 9.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:15<11:21, 1.83s/it, est. speed input: 148.11 toks/s, output: 24.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:15<06:52, 1.12s/it, est. speed input: 207.70 toks/s, output: 33.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:15<03:15, 1.87it/s, est. speed input: 323.48 toks/s, output: 56.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 14/377 [00:15<02:10, 2.78it/s, est. speed input: 406.54 toks/s, output: 73.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 17/377 [00:15<01:31, 3.93it/s, est. speed input: 489.43 toks/s, output: 90.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 21/377 [00:16<00:59, 5.99it/s, est. speed input: 597.29 toks/s, output: 115.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 24/377 [00:16<00:48, 7.33it/s, est. speed input: 676.81 toks/s, output: 133.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 30/377 [00:16<00:28, 11.99it/s, est. speed input: 842.17 toks/s, output: 172.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 37/377 [00:16<00:18, 18.16it/s, est. speed input: 1030.70 toks/s, output: 217.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 41/377 [00:16<00:16, 20.52it/s, est. speed input: 1133.94 toks/s, output: 245.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 47/377 [00:16<00:12, 25.92it/s, est. speed input: 1288.04 toks/s, output: 286.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 53/377 [00:16<00:10, 30.85it/s, est. speed input: 1441.87 toks/s, output: 329.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 59/377 [00:17<00:10, 31.63it/s, est. speed input: 1586.18 toks/s, output: 372.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 66/377 [00:17<00:08, 35.78it/s, est. speed input: 1757.03 toks/s, output: 423.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 71/377 [00:17<00:08, 37.18it/s, est. speed input: 1877.36 toks/s, output: 460.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 76/377 [00:17<00:09, 32.33it/s, est. speed input: 2000.58 toks/s, output: 497.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 82/377 [00:17<00:08, 36.63it/s, est. speed input: 2144.69 toks/s, output: 545.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 87/377 [00:17<00:07, 38.05it/s, est. speed input: 2261.15 toks/s, output: 585.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 95/377 [00:17<00:06, 46.02it/s, est. speed input: 2455.84 toks/s, output: 650.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 102/377 [00:18<00:05, 50.25it/s, est. speed input: 2621.65 toks/s, output: 710.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 108/377 [00:18<00:05, 51.09it/s, est. speed input: 2759.32 toks/s, output: 762.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 118/377 [00:18<00:04, 61.99it/s, est. speed input: 2998.61 toks/s, output: 851.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 125/377 [00:18<00:03, 63.50it/s, est. speed input: 3161.18 toks/s, output: 911.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 136/377 [00:18<00:03, 72.35it/s, est. speed input: 3416.04 toks/s, output: 1012.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 144/377 [00:18<00:03, 71.11it/s, est. speed input: 3594.64 toks/s, output: 1084.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 152/377 [00:18<00:03, 66.72it/s, est. speed input: 3769.43 toks/s, output: 1157.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:18<00:02, 75.33it/s, est. speed input: 4016.89 toks/s, output: 1263.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 171/377 [00:19<00:03, 59.02it/s, est. speed input: 4169.05 toks/s, output: 1334.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 178/377 [00:19<00:03, 60.43it/s, est. speed input: 4314.68 toks/s, output: 1404.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 187/377 [00:19<00:02, 66.63it/s, est. speed input: 4509.80 toks/s, output: 1496.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 195/377 [00:19<00:03, 55.82it/s, est. speed input: 4657.99 toks/s, output: 1571.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 202/377 [00:19<00:03, 52.03it/s, est. speed input: 4787.40 toks/s, output: 1640.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 208/377 [00:19<00:03, 51.69it/s, est. speed input: 4903.23 toks/s, output: 1702.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 217/377 [00:19<00:02, 58.24it/s, est. speed input: 5090.51 toks/s, output: 1801.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 224/377 [00:20<00:02, 58.68it/s, est. speed input: 5227.28 toks/s, output: 1879.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 233/377 [00:20<00:02, 64.50it/s, est. speed input: 5414.69 toks/s, output: 1983.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 243/377 [00:20<00:01, 68.47it/s, est. speed input: 5613.19 toks/s, output: 2100.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 250/377 [00:20<00:01, 67.81it/s, est. speed input: 5742.17 toks/s, output: 2183.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▊ | 259/377 [00:20<00:01, 72.02it/s, est. speed input: 5921.50 toks/s, output: 2292.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 267/377 [00:20<00:01, 68.12it/s, est. speed input: 6072.60 toks/s, output: 2387.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▎ | 278/377 [00:20<00:01, 76.69it/s, est. speed input: 6300.79 toks/s, output: 2528.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 286/377 [00:20<00:01, 76.13it/s, est. speed input: 6453.93 toks/s, output: 2629.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 294/377 [00:21<00:01, 63.02it/s, est. speed input: 6583.51 toks/s, output: 2724.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 301/377 [00:21<00:01, 62.03it/s, est. speed input: 6706.15 toks/s, output: 2816.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 308/377 [00:21<00:01, 55.12it/s, est. speed input: 6816.86 toks/s, output: 2905.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:21<00:01, 54.82it/s, est. speed input: 6918.96 toks/s, output: 2987.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 321/377 [00:21<00:01, 54.60it/s, est. speed input: 7034.66 toks/s, output: 3083.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 327/377 [00:21<00:01, 47.18it/s, est. speed input: 7111.79 toks/s, output: 3161.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 332/377 [00:21<00:01, 40.45it/s, est. speed input: 7163.58 toks/s, output: 3222.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 337/377 [00:21<00:00, 42.13it/s, est. speed input: 7237.30 toks/s, output: 3297.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 342/377 [00:22<00:01, 33.39it/s, est. speed input: 7266.38 toks/s, output: 3356.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 346/377 [00:22<00:00, 34.13it/s, est. speed input: 7316.68 toks/s, output: 3417.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:22<00:00, 32.87it/s, est. speed input: 7357.04 toks/s, output: 3477.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 355/377 [00:22<00:00, 27.39it/s, est. speed input: 7380.36 toks/s, output: 3541.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 359/377 [00:22<00:00, 26.17it/s, est. speed input: 7407.75 toks/s, output: 3600.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:23<00:00, 22.13it/s, est. speed input: 7403.33 toks/s, output: 3634.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:23<00:00, 22.15it/s, est. speed input: 7425.49 toks/s, output: 3697.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:23<00:00, 18.19it/s, est. speed input: 7406.95 toks/s, output: 3727.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:23<00:00, 16.15it/s, est. speed input: 7391.55 toks/s, output: 3747.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:24<00:00, 8.53it/s, est. speed input: 7233.80 toks/s, output: 3700.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:39<00:00, 8.53it/s, est. speed input: 7233.80 toks/s, output: 3700.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [01:00<00:14, 4.76s/it, est. speed input: 2940.95 toks/s, output: 1586.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [01:02<00:08, 4.46s/it, est. speed input: 2814.99 toks/s, output: 1601.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.98it/s, est. speed input: 2825.38 toks/s, output: 1773.44 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.33804944157600403, 'actor/pg_clipfrac': 0.0013227512827143073, 'actor/ppo_kl': 0.00019066674576606601}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.7555254101753235, 'actor/pg_clipfrac': 0.001742160296998918, 'actor/ppo_kl': -0.00032855907920747995}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2649880349636078, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003303957055322826}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3073526918888092, 'actor/pg_clipfrac': 0.0011402508243918419, 'actor/ppo_kl': 0.0010537677444517612}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.23521031439304352, 'actor/pg_clipfrac': 0.0025575447361916304, 'actor/ppo_kl': 0.0007096434710547328}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.4103509783744812, 'actor/pg_clipfrac': 0.0012853470398113132, 'actor/ppo_kl': 0.002036013873293996}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2923220694065094, 'actor/pg_clipfrac': 0.0019417476141825318, 'actor/ppo_kl': 0.0017985187005251646}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.306505024433136, 'actor/pg_clipfrac': 0.0027816412039101124, 'actor/ppo_kl': -0.00046415688120760024}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.13691942393779755, 'actor/pg_clipfrac': 0.0010362694738432765, 'actor/ppo_kl': 0.0013330726651474833}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2207706719636917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009618425974622369}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00011790913413278759, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011982950381934643}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0010180866811424494, 'actor/pg_clipfrac': 0.0006006006151437759, 'actor/ppo_kl': -0.00030938797863200307}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00010899524932028726, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00012770821922458708}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.23292557895183563, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0026109565515071154}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6125861406326294, 'actor/pg_clipfrac': 0.002624671906232834, 'actor/ppo_kl': -0.0006530651589855552}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.02531556226313114, 'actor/pg_clipfrac': 0.0010822510812431574, 'actor/ppo_kl': -0.00026815381716005504}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1622573584318161, 'actor/pg_clipfrac': 0.001179245300590992, 'actor/ppo_kl': -0.0007312477682717144}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.48615726828575134, 'actor/pg_clipfrac': 0.001287001301534474, 'actor/ppo_kl': 6.983915227465332e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00020765048975590616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010879167821258307}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3391667604446411, 'actor/pg_clipfrac': 0.0021739129442721605, 'actor/ppo_kl': 0.0016021686606109142}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0001929688296513632, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005278762546367943}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.13192766904830933, 'actor/pg_clipfrac': 0.0009689922444522381, 'actor/ppo_kl': 0.00037150050047785044}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.057754650712013245, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011325442465022206}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00018872109649237245, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0020453352481126785}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.16544313728809357, 'actor/pg_clipfrac': 0.003846153849735856, 'actor/ppo_kl': -0.001956281950697303}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.69 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-07 23:59:44 [executor_base.py:219] It took 0.338760 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.61 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:01:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-07 23:59:44 [executor_base.py:219] It took 0.337495 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:01:35 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:01:35 [executor_base.py:208] It took 0.327823 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:01:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:01:35 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:01:35 [executor_base.py:208] It took 0.328943 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since point D is the midpoint of AB, AD = DB = 2.0. The perimeter of triangle ABC is 16.0, so AB + BC + AC = 16.0. Given that BD = 2.0, we can deduce that AD = 2.0 as well, making AB = 4.0. The perimeter of triangle ABC includes AB, AC, and BC, so AC + BC = 12.0. Point E is on the vertical line l passing through D, which is the perpendicular bisector of AB. This means that AE = BE. The perimeter of triangle AEC is AC + AE + EC. Since AE = BE, the perimeter can be simplified to AC + BE + EC. To minimize this perimeter, we need to minimize the sum of AC and BE, and maximize EC. The maximum value of EC is when E is at point C, making EC = AC. The minimum value of AC + BE is when A, E, and B are collinear, which means AC = BE. Therefore, the minimum perimeter of triangle AEC is AC + AC = 2 * AC. Since AC + BC = 12.0, the minimum perimeter of triangle AEC is 2 * (12.0 / 2) = 12.0.\n\n\nThe minimum perimeter of triangle AEC is 12.0.\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo solve this problem, we need to understand the geometric transformations and angles involved. Since triangle ABC is rotated clockwise around point A by 90° to form triangle AB'C', we know that triangle AB'C' is congruent to triangle ABC. The rotation preserves distances and angles, so angle BAC is equal to angle B'A'C' = 90°. Furthermore, since B'A' is the image of B, angle AB'A' is also 90°. \n\nThe key information given is angle CC'B' = 32°. Since B'C' is the image of BC, angle B'CC' is equal to angle B'CB = 90° - angle CC'B' = 90° - 32° = 58°. Now, since triangle AB'C' is a right triangle (because angle BAC is 90° and angle AB'A' is also 90°), angle AC'B' = 90° - angle B'CC' = 90° - 58° = 32°. \n\nTherefore, angle AC'B' is 32°.\n\n\nThe size of angle AC'B' is 32.0.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nIn a circle, if a triangle is inscribed with one of its vertices on the circle's diameter, the angle opposite to that vertex is a right angle. This is a property known as the Inscribed Angle Theorem. Since BE is the diameter of circle O, angle AEB is a right angle (90 degrees).\n\nAdditionally, in a parallelogram, opposite angles are equal. Since angle ADC = 54 degrees, and AD and BC are opposite sides of the parallelogram, angle ABC is also 54 degrees.\n\nNow, considering triangle ABE, since angle AEB is a right angle, and angle ABC is 54 degrees, we can find angle BAE by subtracting these two angles from 180 degrees (the sum of angles in a triangle):\nangle BAE = 180 degrees - angle AEB - angle ABC\nangle BAE = 180 degrees - 90 degrees - 54 degrees\nangle BAE = 36 degrees\n\nTherefore, angle AEB is 90 degrees.\n\n\nThe degree of angle AEB is 90 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, point C is on the extended line of AB, and CD is tangent to circle O at point D, we can use the properties of circles and tangents to solve for angle C.\n\nSince AB is the diameter, angle ADB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n\nGiven angle A = 35 degrees, we can find angle ADB by subtracting angle A from 90 degrees, because angle ADB is the complement of angle A in the right triangle ADB.\n\nSo, angle ADB = 90 - 35 = 55 degrees.\n\nSince CD is a tangent line to the circle at point D, angle ODC is also a right angle (90 degrees) because a tangent to a circle is perpendicular to the radius at the point of tangency.\n\nNow, angle ODC is the sum of angle C and angle ADB, because they are adjacent angles on a straight line.\n\nSo, angle C = angle ODC - angle ADB = 90 - 55 = 35 degrees.\n\nTherefore, angle C = 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of the circle O, angle AOD is a right angle (90 degrees) because any angle subtended by a diameter in a semicircle is a right angle. Since EA is a tangent to the circle, angle OAE is also a right angle (90 degrees). Therefore, angle EAO is the remaining angle in the right triangle AOD, which is 90 degrees - 120 degrees = 30 degrees. Angle BAC is half of angle EAO because angle BAC is an inscribed angle that subtends the same arc as angle EAO. Therefore, angle BAC = 30 degrees / 2 = 15 degrees. Since angle ABC is an inscribed angle that subtends the same arc as angle BAC, it is half of angle BAC. Therefore, angle ABC = 15 degrees / 2 = 7.5 degrees.\n\n\nThe degree of angle ABC is 7.5 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:32:53, 4.37s/it, est. speed input: 101.58 toks/s, output: 23.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:22<40:00, 1.89s/it, est. speed input: 195.82 toks/s, output: 45.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<24:02, 1.14s/it, est. speed input: 285.60 toks/s, output: 62.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<16:35, 1.27it/s, est. speed input: 359.36 toks/s, output: 81.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<12:27, 1.68it/s, est. speed input: 429.65 toks/s, output: 100.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:26<05:08, 4.01it/s, est. speed input: 679.65 toks/s, output: 160.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:26<04:10, 4.94it/s, est. speed input: 760.68 toks/s, output: 178.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:28<03:22, 6.02it/s, est. speed input: 943.06 toks/s, output: 220.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:29<02:58, 6.82it/s, est. speed input: 1012.41 toks/s, output: 238.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:29<02:27, 8.20it/s, est. speed input: 1083.36 toks/s, output: 261.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:30<02:39, 7.55it/s, est. speed input: 1122.98 toks/s, output: 273.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:30<01:56, 10.28it/s, est. speed input: 1256.85 toks/s, output: 308.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:30<01:49, 10.90it/s, est. speed input: 1318.49 toks/s, output: 330.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:31<01:38, 12.01it/s, est. speed input: 1381.51 toks/s, output: 348.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:31<01:21, 14.41it/s, est. speed input: 1445.14 toks/s, output: 370.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:31<00:57, 20.45it/s, est. speed input: 1580.53 toks/s, output: 415.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:31<00:54, 21.20it/s, est. speed input: 1642.78 toks/s, output: 438.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:32<00:57, 20.34it/s, est. speed input: 1701.38 toks/s, output: 456.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:32<00:50, 23.07it/s, est. speed input: 1765.74 toks/s, output: 478.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:32<00:38, 29.97it/s, est. speed input: 1898.56 toks/s, output: 519.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:32<00:43, 26.28it/s, est. speed input: 1952.00 toks/s, output: 529.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:32<00:43, 26.04it/s, est. speed input: 2012.14 toks/s, output: 550.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:33<00:47, 23.75it/s, est. speed input: 2067.83 toks/s, output: 565.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:33<00:57, 19.57it/s, est. speed input: 2166.98 toks/s, output: 601.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:33<00:30, 35.88it/s, est. speed input: 2413.86 toks/s, output: 680.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:34<00:29, 36.73it/s, est. speed input: 2474.00 toks/s, output: 703.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:34<00:45, 23.84it/s, est. speed input: 2501.75 toks/s, output: 714.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:34<00:43, 24.84it/s, est. speed input: 2555.89 toks/s, output: 726.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:34<00:31, 34.07it/s, est. speed input: 2676.36 toks/s, output: 775.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:35<00:29, 35.85it/s, est. speed input: 2731.56 toks/s, output: 797.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:35<00:27, 38.33it/s, est. speed input: 2847.65 toks/s, output: 840.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:35<00:20, 51.49it/s, est. speed input: 3019.03 toks/s, output: 911.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:35<00:17, 58.96it/s, est. speed input: 3140.36 toks/s, output: 960.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:35<00:18, 54.16it/s, est. speed input: 3256.68 toks/s, output: 1001.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:36<00:20, 48.44it/s, est. speed input: 3356.81 toks/s, output: 1041.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:36<00:19, 52.54it/s, est. speed input: 3468.85 toks/s, output: 1084.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:36<00:22, 44.71it/s, est. speed input: 3562.60 toks/s, output: 1120.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:36<00:18, 53.49it/s, est. speed input: 3679.47 toks/s, output: 1167.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:36<00:17, 56.93it/s, est. speed input: 3788.01 toks/s, output: 1213.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:36<00:12, 77.62it/s, est. speed input: 4021.88 toks/s, output: 1301.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:37<00:21, 43.63it/s, est. speed input: 4086.26 toks/s, output: 1334.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:37<00:15, 61.03it/s, est. speed input: 4318.90 toks/s, output: 1426.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:37<00:14, 63.03it/s, est. speed input: 4424.12 toks/s, output: 1471.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:37<00:15, 57.29it/s, est. speed input: 4519.66 toks/s, output: 1517.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:38<00:13, 65.00it/s, est. speed input: 4677.30 toks/s, output: 1583.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:38<00:19, 45.52it/s, est. speed input: 4748.23 toks/s, output: 1627.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:38<00:14, 57.93it/s, est. speed input: 4918.88 toks/s, output: 1703.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:38<00:15, 55.41it/s, est. speed input: 5012.36 toks/s, output: 1751.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:39<00:18, 46.02it/s, est. speed input: 5083.50 toks/s, output: 1784.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:39<00:15, 52.46it/s, est. speed input: 5181.24 toks/s, output: 1827.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:39<00:15, 52.02it/s, est. speed input: 5275.41 toks/s, output: 1876.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:39<00:13, 58.80it/s, est. speed input: 5423.20 toks/s, output: 1910.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:39<00:19, 42.30it/s, est. speed input: 5485.27 toks/s, output: 1943.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:40<00:18, 44.09it/s, est. speed input: 5570.06 toks/s, output: 1986.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:40<00:17, 43.90it/s, est. speed input: 5654.38 toks/s, output: 2032.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:40<00:16, 48.35it/s, est. speed input: 5749.69 toks/s, output: 2088.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:40<00:15, 49.93it/s, est. speed input: 5835.24 toks/s, output: 2140.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:40<00:08, 90.10it/s, est. speed input: 6160.15 toks/s, output: 2313.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:41<00:08, 83.96it/s, est. speed input: 6295.99 toks/s, output: 2381.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:41<00:08, 84.79it/s, est. speed input: 6432.27 toks/s, output: 2462.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:41<00:08, 85.46it/s, est. speed input: 6518.64 toks/s, output: 2517.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:41<00:06, 106.32it/s, est. speed input: 6721.40 toks/s, output: 2619.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:41<00:07, 82.80it/s, est. speed input: 6847.78 toks/s, output: 2683.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:41<00:06, 100.65it/s, est. speed input: 7056.64 toks/s, output: 2799.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:42<00:05, 111.11it/s, est. speed input: 7262.40 toks/s, output: 2920.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:42<00:05, 115.17it/s, est. speed input: 7403.78 toks/s, output: 3000.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:42<00:04, 118.63it/s, est. speed input: 7597.49 toks/s, output: 3128.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:42<00:03, 144.11it/s, est. speed input: 7848.16 toks/s, output: 3261.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:42<00:03, 140.24it/s, est. speed input: 8037.61 toks/s, output: 3382.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:42<00:03, 162.68it/s, est. speed input: 8280.54 toks/s, output: 3518.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:42<00:02, 182.61it/s, est. speed input: 8630.13 toks/s, output: 3697.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:42<00:02, 186.33it/s, est. speed input: 8829.47 toks/s, output: 3815.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:43<00:02, 181.63it/s, est. speed input: 9031.16 toks/s, output: 3931.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:43<00:03, 133.75it/s, est. speed input: 9189.33 toks/s, output: 4036.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:43<00:03, 101.55it/s, est. speed input: 9335.72 toks/s, output: 4116.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:43<00:03, 109.18it/s, est. speed input: 9468.15 toks/s, output: 4215.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:43<00:03, 105.53it/s, est. speed input: 9587.96 toks/s, output: 4299.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:44<00:02, 120.05it/s, est. speed input: 9817.87 toks/s, output: 4446.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:44<00:02, 114.27it/s, est. speed input: 9947.31 toks/s, output: 4533.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:44<00:02, 123.53it/s, est. speed input: 10123.31 toks/s, output: 4648.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:44<00:01, 148.52it/s, est. speed input: 10352.45 toks/s, output: 4794.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:44<00:01, 138.08it/s, est. speed input: 10529.07 toks/s, output: 4951.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:44<00:01, 133.15it/s, est. speed input: 10698.75 toks/s, output: 5070.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:44<00:01, 141.15it/s, est. speed input: 10885.32 toks/s, output: 5207.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:45<00:01, 141.32it/s, est. speed input: 11026.17 toks/s, output: 5310.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:45<00:01, 118.64it/s, est. speed input: 11184.93 toks/s, output: 5464.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:45<00:01, 133.77it/s, est. speed input: 11366.74 toks/s, output: 5604.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:45<00:01, 136.21it/s, est. speed input: 11495.52 toks/s, output: 5716.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:45<00:01, 127.16it/s, est. speed input: 11689.83 toks/s, output: 5885.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:45<00:01, 91.15it/s, est. speed input: 11770.86 toks/s, output: 5966.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:46<00:01, 81.93it/s, est. speed input: 11860.99 toks/s, output: 6077.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:46<00:00, 85.43it/s, est. speed input: 11981.87 toks/s, output: 6221.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:46<00:00, 77.17it/s, est. speed input: 12037.10 toks/s, output: 6269.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:46<00:00, 76.58it/s, est. speed input: 12166.57 toks/s, output: 6417.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:46<00:00, 73.19it/s, est. speed input: 12226.78 toks/s, output: 6485.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:47<00:00, 76.91it/s, est. speed input: 12297.46 toks/s, output: 6571.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:47<00:00, 58.57it/s, est. speed input: 12316.17 toks/s, output: 6616.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:48<00:00, 27.89it/s, est. speed input: 12191.22 toks/s, output: 6638.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 14.18it/s, est. speed input: 11861.69 toks/s, output: 6491.48 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 25.50it/s, est. speed input: 11861.69 toks/s, output: 6491.48 toks/s]
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m Step 10
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.291
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.014
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.601
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.601
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 901513
[36m(Runner pid=3309020)[0m balanced_min: 901512
[36m(Runner pid=3309020)[0m max: 910770
[36m(Runner pid=3309020)[0m mean: 901512.5
[36m(Runner pid=3309020)[0m min: 892255
[36m(Runner pid=3309020)[0m minmax_diff: 18515
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 104.21
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.111
[36m(Runner pid=3309020)[0m throughput: 878.091
[36m(Runner pid=3309020)[0m time_per_step: 1026.673
[36m(Runner pid=3309020)[0m total_num_tokens: 1803025
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 807.0
[36m(Runner pid=3309020)[0m mean: 464.988
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1087.0
[36m(Runner pid=3309020)[0m mean: 239.318
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.211
[36m(Runner pid=3309020)[0m format: 0.988
[36m(Runner pid=3309020)[0m overall: 0.601
[36m(Runner pid=3309020)[0m tag_reward: 0.996
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.558566409427293e-05
[36m(Runner pid=3309020)[0m gen: 0.158
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.048
[36m(Runner pid=3309020)[0m reward: 0.011
[36m(Runner pid=3309020)[0m update_actor: 0.311
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.154
[36m(Runner pid=3309020)[0m gen: 97.07
[36m(Runner pid=3309020)[0m old: 83.137
[36m(Runner pid=3309020)[0m ref: 86.299
[36m(Runner pid=3309020)[0m reward: 6.564
[36m(Runner pid=3309020)[0m save_checkpoint: 30.466
[36m(Runner pid=3309020)[0m step: 1026.673
[36m(Runner pid=3309020)[0m update_actor: 560.188
[36m(Runner pid=3309020)[0m validation: 162.187
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.296
[36m(Runner pid=3309020)[0m format_reward: 0.985
[36m(Runner pid=3309020)[0m overall_reward: 0.641
[36m(Runner pid=3309020)[0m reward_score: 0.641
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.989
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 11; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:02:29 [executor_base.py:219] It took 0.339312 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:03:54 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:02:29 [executor_base.py:219] It took 0.337850 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:03:54 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:03:54 [executor_base.py:208] It took 0.327673 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:03:55 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:03:55 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:03:55 [executor_base.py:208] It took 0.325926 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0003067194193135947, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005240042228251696}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5593259334564209, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005952041829004884}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.4474392533302307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006933514378033578}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0001528692082501948, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3637060523033142, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.07227759808301926, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001233228831551969}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.838985800743103, 'actor/pg_clipfrac': 0.006211180239915848, 'actor/ppo_kl': 0.0014315737644210458}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1839876025915146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001380562171107158, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.13160356879234314, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022372919193003327}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.06164327263832092, 'actor/pg_clipfrac': 0.000641436839941889, 'actor/ppo_kl': -0.0012542805634438992}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4473378360271454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11680693924427032, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006144000217318535}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003520172322168946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010015120496973395}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.1622346192598343, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.18763002753257751, 'actor/pg_clipfrac': 0.0017301038606092334, 'actor/ppo_kl': -0.0032254387624561787}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.11878514289855957, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 3.708406075020321e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.09718377143144608, 'actor/pg_clipfrac': 0.0012210012646391988, 'actor/ppo_kl': 0.00041320297168567777}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3458077013492584, 'actor/pg_clipfrac': 0.0007547169807367027, 'actor/ppo_kl': 0.00015127650112845004}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00016916617460083216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008903050911612809}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002956548414658755, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009560916223563254}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.36037328839302063, 'actor/pg_clipfrac': 0.0011135857785120606, 'actor/ppo_kl': 0.0020696294959634542}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0698002353310585, 'actor/pg_clipfrac': 0.0016949152341112494, 'actor/ppo_kl': -0.0015051906229928136}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.08316605538129807, 'actor/pg_clipfrac': 0.0013586956774815917, 'actor/ppo_kl': -0.00030187421361915767}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.21644948422908783, 'actor/pg_clipfrac': 0.0005580357392318547, 'actor/ppo_kl': 0.00015497846470680088}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00019435331341810524, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011362741934135556}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.04537532851099968, 'actor/pg_clipfrac': 0.003015075344592333, 'actor/ppo_kl': -0.0006231394363567233}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.000147595361340791, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018064409960061312}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00012526304635684937, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00046213806490413845}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.6047374606132507, 'actor/pg_clipfrac': 0.0018604651559144258, 'actor/ppo_kl': 0.00041929734288714826}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.17593809962272644, 'actor/pg_clipfrac': 0.0009216589969582856, 'actor/ppo_kl': 8.939664257923141e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.080123171210289, 'actor/pg_clipfrac': 0.0009832842042669654, 'actor/ppo_kl': 0.000785323150921613}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.1272508203983307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006462059682235122}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.341487318277359, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002448839135468006}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.1963108479976654, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006691243033856153}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.25274965167045593, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001692386285867542}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5716392993927002, 'actor/pg_clipfrac': 0.0010822510812431574, 'actor/ppo_kl': -0.0005206669447943568}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.17362187802791595, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013541619991883636}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.06527011096477509, 'actor/pg_clipfrac': 0.0009606147650629282, 'actor/ppo_kl': -0.0009761594119481742}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1546924114227295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007480985950678587}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00013438711175695062, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011029718443751335}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.16546104848384857, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0026032323949038982}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.25606629252433777, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000726597907487303}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.280746191740036, 'actor/pg_clipfrac': 0.002436053706333041, 'actor/ppo_kl': -0.0010312460362911224}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003075539425481111, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0023080818355083466}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00029177419492043555, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000911729468498379}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3160865306854248, 'actor/pg_clipfrac': 0.0015479875728487968, 'actor/ppo_kl': -0.0006744751008227468}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.048464369028806686, 'actor/pg_clipfrac': 0.00148148147854954, 'actor/ppo_kl': -0.001304513425566256}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.25017762184143066, 'actor/pg_clipfrac': 0.0010010009864345193, 'actor/ppo_kl': -0.001323510892689228}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:35:49, 4.51s/it, est. speed input: 103.56 toks/s, output: 20.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<42:47, 2.02s/it, est. speed input: 196.68 toks/s, output: 42.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<25:10, 1.19s/it, est. speed input: 277.24 toks/s, output: 63.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<16:09, 1.30it/s, est. speed input: 355.31 toks/s, output: 85.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<10:59, 1.90it/s, est. speed input: 435.16 toks/s, output: 103.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:26<08:03, 2.59it/s, est. speed input: 510.07 toks/s, output: 122.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:27<06:37, 3.13it/s, est. speed input: 579.34 toks/s, output: 139.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<07:30, 2.75it/s, est. speed input: 609.69 toks/s, output: 149.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<05:26, 3.78it/s, est. speed input: 682.88 toks/s, output: 169.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:30<03:56, 5.20it/s, est. speed input: 753.84 toks/s, output: 192.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:30<02:39, 7.65it/s, est. speed input: 890.17 toks/s, output: 233.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:31<01:27, 13.75it/s, est. speed input: 1110.29 toks/s, output: 296.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:31<01:20, 14.85it/s, est. speed input: 1175.89 toks/s, output: 316.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:32<01:31, 12.97it/s, est. speed input: 1283.39 toks/s, output: 350.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:32<01:47, 11.07it/s, est. speed input: 1330.81 toks/s, output: 367.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:33<01:29, 13.11it/s, est. speed input: 1392.36 toks/s, output: 386.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:33<01:10, 16.69it/s, est. speed input: 1511.19 toks/s, output: 425.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<01:12, 16.03it/s, est. speed input: 1608.64 toks/s, output: 452.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:34<01:03, 18.17it/s, est. speed input: 1671.56 toks/s, output: 474.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:34<00:38, 29.89it/s, est. speed input: 1871.90 toks/s, output: 544.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:34<00:36, 30.66it/s, est. speed input: 1985.72 toks/s, output: 589.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:34<00:33, 33.45it/s, est. speed input: 2104.50 toks/s, output: 630.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:35<00:23, 47.26it/s, est. speed input: 2292.85 toks/s, output: 697.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:35<00:22, 49.67it/s, est. speed input: 2418.93 toks/s, output: 742.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:35<00:19, 54.53it/s, est. speed input: 2594.59 toks/s, output: 808.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:35<00:22, 47.50it/s, est. speed input: 2703.24 toks/s, output: 848.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:35<00:22, 46.72it/s, est. speed input: 2811.83 toks/s, output: 887.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:36<00:16, 61.51it/s, est. speed input: 2996.46 toks/s, output: 959.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:36<00:16, 61.39it/s, est. speed input: 3115.65 toks/s, output: 1004.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:36<00:16, 62.22it/s, est. speed input: 3250.92 toks/s, output: 1050.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:36<00:14, 71.72it/s, est. speed input: 3424.16 toks/s, output: 1120.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:36<00:12, 79.07it/s, est. speed input: 3619.80 toks/s, output: 1170.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:36<00:11, 83.35it/s, est. speed input: 3732.64 toks/s, output: 1217.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:36<00:12, 78.35it/s, est. speed input: 3843.79 toks/s, output: 1263.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:37<00:12, 75.06it/s, est. speed input: 3951.32 toks/s, output: 1311.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:37<00:14, 66.42it/s, est. speed input: 4048.46 toks/s, output: 1340.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:37<00:10, 86.21it/s, est. speed input: 4264.69 toks/s, output: 1434.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:37<00:16, 55.64it/s, est. speed input: 4338.31 toks/s, output: 1467.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:37<00:12, 74.76it/s, est. speed input: 4560.25 toks/s, output: 1571.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:38<00:12, 73.21it/s, est. speed input: 4713.19 toks/s, output: 1640.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:38<00:10, 82.58it/s, est. speed input: 4883.79 toks/s, output: 1718.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:38<00:14, 59.00it/s, est. speed input: 4958.73 toks/s, output: 1759.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:38<00:10, 78.53it/s, est. speed input: 5178.95 toks/s, output: 1849.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:38<00:09, 90.39it/s, est. speed input: 5396.04 toks/s, output: 1960.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:38<00:08, 98.04it/s, est. speed input: 5558.41 toks/s, output: 2023.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:39<00:09, 79.80it/s, est. speed input: 5701.11 toks/s, output: 2097.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:39<00:08, 89.25it/s, est. speed input: 5859.45 toks/s, output: 2157.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:39<00:07, 97.96it/s, est. speed input: 6020.44 toks/s, output: 2221.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:39<00:06, 119.14it/s, est. speed input: 6291.41 toks/s, output: 2344.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:39<00:07, 102.41it/s, est. speed input: 6435.56 toks/s, output: 2414.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:40<00:07, 93.47it/s, est. speed input: 6564.86 toks/s, output: 2476.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:40<00:07, 93.35it/s, est. speed input: 6709.12 toks/s, output: 2555.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:40<00:07, 89.80it/s, est. speed input: 6809.16 toks/s, output: 2606.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:40<00:08, 77.80it/s, est. speed input: 6896.37 toks/s, output: 2660.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:40<00:09, 70.81it/s, est. speed input: 6980.64 toks/s, output: 2716.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:40<00:08, 74.99it/s, est. speed input: 7124.30 toks/s, output: 2787.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:41<00:10, 58.58it/s, est. speed input: 7189.47 toks/s, output: 2833.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:41<00:07, 81.16it/s, est. speed input: 7384.37 toks/s, output: 2937.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:41<00:06, 99.99it/s, est. speed input: 7597.16 toks/s, output: 3054.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:41<00:05, 111.00it/s, est. speed input: 7789.45 toks/s, output: 3151.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:41<00:04, 119.71it/s, est. speed input: 7979.54 toks/s, output: 3267.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:41<00:04, 116.48it/s, est. speed input: 8124.52 toks/s, output: 3340.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:42<00:04, 106.61it/s, est. speed input: 8252.34 toks/s, output: 3412.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:42<00:04, 108.89it/s, est. speed input: 8399.99 toks/s, output: 3503.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:42<00:04, 111.73it/s, est. speed input: 8552.77 toks/s, output: 3577.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:42<00:05, 84.54it/s, est. speed input: 8668.73 toks/s, output: 3650.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:42<00:05, 87.44it/s, est. speed input: 8804.96 toks/s, output: 3736.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:42<00:05, 89.76it/s, est. speed input: 8930.66 toks/s, output: 3818.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:42<00:03, 117.78it/s, est. speed input: 9174.24 toks/s, output: 3977.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:43<00:03, 131.56it/s, est. speed input: 9376.66 toks/s, output: 4095.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:43<00:02, 157.74it/s, est. speed input: 9621.07 toks/s, output: 4240.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:43<00:02, 164.71it/s, est. speed input: 9813.60 toks/s, output: 4360.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:43<00:02, 160.37it/s, est. speed input: 10001.71 toks/s, output: 4475.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:43<00:01, 177.78it/s, est. speed input: 10254.51 toks/s, output: 4628.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:43<00:01, 153.40it/s, est. speed input: 10427.07 toks/s, output: 4750.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:43<00:01, 140.14it/s, est. speed input: 10594.73 toks/s, output: 4866.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:44<00:02, 120.82it/s, est. speed input: 10705.74 toks/s, output: 4962.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:44<00:02, 115.08it/s, est. speed input: 10821.02 toks/s, output: 5066.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:44<00:01, 138.18it/s, est. speed input: 11040.64 toks/s, output: 5254.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:44<00:01, 121.84it/s, est. speed input: 11164.79 toks/s, output: 5340.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:44<00:01, 115.84it/s, est. speed input: 11325.36 toks/s, output: 5491.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:44<00:01, 121.70it/s, est. speed input: 11454.19 toks/s, output: 5607.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:44<00:01, 127.06it/s, est. speed input: 11663.51 toks/s, output: 5781.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:45<00:01, 88.13it/s, est. speed input: 11737.07 toks/s, output: 5858.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:45<00:01, 107.61it/s, est. speed input: 11961.47 toks/s, output: 6061.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:45<00:00, 101.98it/s, est. speed input: 12110.04 toks/s, output: 6193.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:45<00:00, 91.88it/s, est. speed input: 12208.52 toks/s, output: 6270.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:46<00:00, 70.36it/s, est. speed input: 12272.05 toks/s, output: 6363.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:46<00:00, 86.78it/s, est. speed input: 12442.32 toks/s, output: 6520.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:47<00:00, 53.55it/s, est. speed input: 12473.67 toks/s, output: 6623.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:48<00:00, 27.81it/s, est. speed input: 12280.21 toks/s, output: 6560.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 14.53it/s, est. speed input: 11906.65 toks/s, output: 6409.68 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 25.55it/s, est. speed input: 11906.65 toks/s, output: 6409.68 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.4484691321849823, 'actor/pg_clipfrac': 0.0010309278732165694, 'actor/ppo_kl': -0.0015777254011482}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.000218531844438985, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016874729772098362}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.20811976492404938, 'actor/pg_clipfrac': 0.0013386880746111274, 'actor/ppo_kl': 0.0008299570763483644}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.21614637970924377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011039053788408637}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.10305943340063095, 'actor/pg_clipfrac': 0.00283286115154624, 'actor/ppo_kl': -4.08107771363575e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00012032396625727415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00017116959497798234}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00010161053069168702, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013450583210214972}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.39453235268592834, 'actor/pg_clipfrac': 0.0015105740167200565, 'actor/ppo_kl': 0.002201956231147051}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.027004774659872055, 'actor/pg_clipfrac': 0.0008216926944442093, 'actor/ppo_kl': 0.0014831273583695292}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.25955238938331604, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017489870078861713}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.04786151275038719, 'actor/pg_clipfrac': 0.000596302910707891, 'actor/ppo_kl': 0.000529591809026897}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.012845117598772049, 'actor/pg_clipfrac': 0.0013404826167970896, 'actor/ppo_kl': 0.00034775619860738516}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.621489942073822, 'actor/pg_clipfrac': 0.002661934355273843, 'actor/ppo_kl': -0.002399011282250285}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5042009949684143, 'actor/pg_clipfrac': 0.0018621974159032106, 'actor/ppo_kl': -0.000899039616342634}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.10767512768507004, 'actor/pg_clipfrac': 0.003911342937499285, 'actor/ppo_kl': -0.002015373669564724}
[36m(Runner pid=3309020)[0m Step 11
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.301
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.016
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.018
[36m(Runner pid=3309020)[0m ppo_kl: 6.991420961046657e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.029
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.029
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.617
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.617
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 919139
[36m(Runner pid=3309020)[0m balanced_min: 919139
[36m(Runner pid=3309020)[0m max: 921257
[36m(Runner pid=3309020)[0m mean: 919139.0
[36m(Runner pid=3309020)[0m min: 917021
[36m(Runner pid=3309020)[0m minmax_diff: 4236
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.521
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.113
[36m(Runner pid=3309020)[0m throughput: 1098.827
[36m(Runner pid=3309020)[0m time_per_step: 836.473
[36m(Runner pid=3309020)[0m total_num_tokens: 1838278
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 814.0
[36m(Runner pid=3309020)[0m mean: 465.461
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1374.0
[36m(Runner pid=3309020)[0m mean: 252.616
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.244
[36m(Runner pid=3309020)[0m format: 0.986
[36m(Runner pid=3309020)[0m overall: 0.617
[36m(Runner pid=3309020)[0m tag_reward: 0.995
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.154
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.305
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.199
[36m(Runner pid=3309020)[0m gen: 99.552
[36m(Runner pid=3309020)[0m old: 83.209
[36m(Runner pid=3309020)[0m ref: 86.478
[36m(Runner pid=3309020)[0m reward: 5.671
[36m(Runner pid=3309020)[0m step: 836.473
[36m(Runner pid=3309020)[0m update_actor: 560.642
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 12; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.65 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:16:26 [executor_base.py:219] It took 0.338295 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.57 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:17:45 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:16:26 [executor_base.py:219] It took 0.337923 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:17:45 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:17:45 [executor_base.py:208] It took 0.328185 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.85 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:17:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:17:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:17:49 [executor_base.py:208] It took 0.326339 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00010071343422168866, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.05157141759991646, 'actor/pg_clipfrac': 0.004934210330247879, 'actor/ppo_kl': 0.0019465936347842216}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.6518357992172241, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00015458920097444206, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.04666798934340477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00011175662802997977, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008691565017215908}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00016298700938932598, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.44731950759887695, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00012456084368750453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.30968475341796875, 'actor/pg_clipfrac': 0.0015197568573057652, 'actor/ppo_kl': -0.00034906025393866}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.1579657942056656, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.009239924140274525, 'actor/pg_clipfrac': 0.0031104199588298798, 'actor/ppo_kl': 7.155672210501507e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00015905722102615982, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005824279505759478}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.35580766201019287, 'actor/pg_clipfrac': 0.0011890606256201863, 'actor/ppo_kl': 0.0010151584865525365}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2539747357368469, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00025003505288623273}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.04499140754342079, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.31395480036735535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001429026888217777}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.23113419115543365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005864539416506886}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.10033097118139267, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010640814434736967}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.3249637186527252, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002460685500409454}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4795515835285187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017457978101447225}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5422730445861816, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003540054021868855}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00023295047867577523, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00088399468222633}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.43797799944877625, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001071222242899239}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.2638087272644043, 'actor/pg_clipfrac': 0.0017361111240461469, 'actor/ppo_kl': -0.0012799783144146204}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3546102046966553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00014260425814427435}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.7311646938323975, 'actor/pg_clipfrac': 0.000928505090996623, 'actor/ppo_kl': -0.00019221362890675664}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.0653708353638649, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017355680465698242}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.07082518935203552, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001763811451382935}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.34813758730888367, 'actor/pg_clipfrac': 0.001835985342040658, 'actor/ppo_kl': 0.0008243948104791343}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.43784958124160767, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002707910491153598}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.29730626940727234, 'actor/pg_clipfrac': 0.0033783784601837397, 'actor/ppo_kl': 0.00032586019369773567}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1236041784286499, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.500202173105208e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3887133002281189, 'actor/pg_clipfrac': 0.001043841359205544, 'actor/ppo_kl': 0.0014085371512919664}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.10640706866979599, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001645170501433313}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00014417828060686588, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009172427817247808}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.47613459825515747, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011499112006276846}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.1007286012172699, 'actor/pg_clipfrac': 0.0024096386041492224, 'actor/ppo_kl': 0.001048313220962882}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3195401132106781, 'actor/pg_clipfrac': 0.001721170381642878, 'actor/ppo_kl': -0.00015183283539954573}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.23706746101379395, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001444135676138103}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00013928188127465546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014954430516809225}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00014443315740209073, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000327467656461522}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.4869544804096222, 'actor/pg_clipfrac': 0.0009433962404727936, 'actor/ppo_kl': 5.65366935916245e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2493409514427185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008013849146664143}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:30:30, 4.26s/it, est. speed input: 106.13 toks/s, output: 23.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<42:26, 2.00s/it, est. speed input: 195.91 toks/s, output: 44.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:23<15:55, 1.32it/s, est. speed input: 385.11 toks/s, output: 81.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:23<11:13, 1.86it/s, est. speed input: 479.36 toks/s, output: 102.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:24<08:33, 2.43it/s, est. speed input: 555.77 toks/s, output: 119.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:26<08:48, 2.36it/s, est. speed input: 599.14 toks/s, output: 130.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:27<06:50, 3.02it/s, est. speed input: 671.99 toks/s, output: 148.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:29<05:17, 3.88it/s, est. speed input: 789.02 toks/s, output: 175.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:30<05:12, 3.92it/s, est. speed input: 851.55 toks/s, output: 189.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:30<04:28, 4.54it/s, est. speed input: 906.07 toks/s, output: 205.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:31<03:29, 5.79it/s, est. speed input: 977.78 toks/s, output: 224.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:31<02:05, 9.63it/s, est. speed input: 1122.22 toks/s, output: 262.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:31<01:43, 11.55it/s, est. speed input: 1191.59 toks/s, output: 284.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:31<01:31, 13.12it/s, est. speed input: 1256.93 toks/s, output: 301.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:31<01:20, 14.71it/s, est. speed input: 1323.93 toks/s, output: 322.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:32<01:08, 17.37it/s, est. speed input: 1382.35 toks/s, output: 341.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:32<00:49, 23.65it/s, est. speed input: 1517.22 toks/s, output: 383.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:32<01:14, 15.71it/s, est. speed input: 1556.03 toks/s, output: 401.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<00:41, 27.63it/s, est. speed input: 1759.38 toks/s, output: 466.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:33<00:40, 28.01it/s, est. speed input: 1875.42 toks/s, output: 504.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:33<00:38, 29.26it/s, est. speed input: 1939.28 toks/s, output: 529.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:34<01:21, 13.83it/s, est. speed input: 1990.37 toks/s, output: 554.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:35<01:22, 13.69it/s, est. speed input: 2037.01 toks/s, output: 571.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:35<01:06, 16.84it/s, est. speed input: 2146.47 toks/s, output: 610.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:35<00:57, 19.23it/s, est. speed input: 2206.12 toks/s, output: 632.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:36<01:01, 17.83it/s, est. speed input: 2248.69 toks/s, output: 649.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:36<00:52, 21.03it/s, est. speed input: 2350.60 toks/s, output: 694.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:37, 29.13it/s, est. speed input: 2521.91 toks/s, output: 761.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:36<00:34, 31.12it/s, est. speed input: 2571.40 toks/s, output: 785.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:37<00:38, 27.66it/s, est. speed input: 2617.64 toks/s, output: 806.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:37<00:30, 34.74it/s, est. speed input: 2774.11 toks/s, output: 859.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:37<00:16, 63.43it/s, est. speed input: 3072.14 toks/s, output: 976.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:37<00:13, 73.69it/s, est. speed input: 3306.52 toks/s, output: 1067.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:37<00:12, 79.87it/s, est. speed input: 3478.73 toks/s, output: 1132.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:38<00:15, 64.48it/s, est. speed input: 3628.13 toks/s, output: 1204.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:38<00:13, 72.75it/s, est. speed input: 3792.43 toks/s, output: 1280.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:38<00:11, 80.53it/s, est. speed input: 3959.01 toks/s, output: 1364.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:38<00:10, 88.69it/s, est. speed input: 4113.58 toks/s, output: 1428.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:38<00:10, 88.72it/s, est. speed input: 4272.61 toks/s, output: 1493.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:38<00:08, 105.55it/s, est. speed input: 4491.85 toks/s, output: 1592.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:39<00:09, 94.01it/s, est. speed input: 4645.48 toks/s, output: 1656.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:39<00:07, 112.15it/s, est. speed input: 4922.94 toks/s, output: 1789.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:39<00:08, 99.71it/s, est. speed input: 5071.84 toks/s, output: 1862.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:39<00:07, 105.64it/s, est. speed input: 5234.50 toks/s, output: 1937.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:39<00:10, 80.66it/s, est. speed input: 5357.01 toks/s, output: 1978.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:40<00:08, 92.30it/s, est. speed input: 5518.29 toks/s, output: 2068.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:40<00:07, 106.04it/s, est. speed input: 5727.52 toks/s, output: 2171.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:40<00:06, 114.91it/s, est. speed input: 5885.52 toks/s, output: 2252.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:40<00:08, 85.38it/s, est. speed input: 6015.02 toks/s, output: 2322.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:40<00:09, 80.07it/s, est. speed input: 6146.63 toks/s, output: 2397.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:40<00:06, 115.07it/s, est. speed input: 6461.10 toks/s, output: 2541.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:41<00:05, 120.27it/s, est. speed input: 6666.88 toks/s, output: 2641.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:41<00:06, 104.47it/s, est. speed input: 6802.15 toks/s, output: 2707.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:41<00:06, 104.72it/s, est. speed input: 6944.95 toks/s, output: 2773.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:41<00:06, 105.29it/s, est. speed input: 7081.45 toks/s, output: 2859.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:41<00:06, 92.58it/s, est. speed input: 7209.63 toks/s, output: 2925.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:41<00:06, 95.87it/s, est. speed input: 7352.54 toks/s, output: 2988.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:42<00:05, 103.55it/s, est. speed input: 7498.21 toks/s, output: 3064.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:42<00:05, 105.32it/s, est. speed input: 7638.34 toks/s, output: 3149.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:42<00:05, 112.58it/s, est. speed input: 7789.31 toks/s, output: 3231.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:42<00:05, 94.87it/s, est. speed input: 7912.96 toks/s, output: 3284.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:42<00:05, 103.11it/s, est. speed input: 8090.75 toks/s, output: 3388.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:42<00:03, 133.58it/s, est. speed input: 8341.79 toks/s, output: 3531.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:42<00:02, 179.36it/s, est. speed input: 8738.03 toks/s, output: 3770.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:43<00:02, 156.70it/s, est. speed input: 8913.42 toks/s, output: 3887.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:43<00:03, 121.67it/s, est. speed input: 9078.37 toks/s, output: 3996.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:43<00:03, 128.99it/s, est. speed input: 9306.77 toks/s, output: 4124.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:43<00:03, 106.38it/s, est. speed input: 9412.94 toks/s, output: 4199.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:43<00:02, 139.44it/s, est. speed input: 9708.63 toks/s, output: 4395.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:43<00:02, 158.19it/s, est. speed input: 9953.38 toks/s, output: 4555.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:44<00:02, 140.30it/s, est. speed input: 10117.14 toks/s, output: 4676.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:44<00:01, 151.17it/s, est. speed input: 10296.91 toks/s, output: 4805.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:44<00:01, 151.77it/s, est. speed input: 10481.22 toks/s, output: 4939.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:44<00:01, 144.75it/s, est. speed input: 10652.60 toks/s, output: 5066.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:44<00:01, 152.87it/s, est. speed input: 10835.26 toks/s, output: 5192.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:44<00:01, 179.59it/s, est. speed input: 11101.81 toks/s, output: 5399.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:45<00:01, 104.27it/s, est. speed input: 11201.78 toks/s, output: 5491.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:45<00:01, 127.21it/s, est. speed input: 11428.91 toks/s, output: 5684.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:45<00:01, 119.69it/s, est. speed input: 11592.55 toks/s, output: 5828.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:45<00:01, 107.70it/s, est. speed input: 11733.09 toks/s, output: 5944.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:45<00:00, 101.15it/s, est. speed input: 11843.60 toks/s, output: 6025.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:46<00:00, 87.87it/s, est. speed input: 11925.78 toks/s, output: 6105.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:46<00:00, 79.72it/s, est. speed input: 12016.62 toks/s, output: 6194.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:46<00:00, 94.90it/s, est. speed input: 12187.82 toks/s, output: 6343.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:47<00:00, 48.88it/s, est. speed input: 12142.03 toks/s, output: 6365.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:47<00:00, 41.86it/s, est. speed input: 12147.15 toks/s, output: 6407.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:47<00:00, 47.08it/s, est. speed input: 12213.50 toks/s, output: 6472.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:48<00:00, 25.28it/s, est. speed input: 12062.99 toks/s, output: 6456.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:49<00:00, 26.04it/s, est. speed input: 11995.91 toks/s, output: 6461.97 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.08538282662630081, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015529399970546365}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.21354956924915314, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002172125270590186}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.03987099230289459, 'actor/pg_clipfrac': 0.001349527621641755, 'actor/ppo_kl': 0.001967776333913207}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.12439671903848648, 'actor/pg_clipfrac': 0.0029411765281111, 'actor/ppo_kl': 0.0015479200519621372}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.07223845273256302, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014012666651979089}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.048767656087875366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002756415924523026}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.08620405197143555, 'actor/pg_clipfrac': 0.0038610037881881, 'actor/ppo_kl': -0.003167929360643029}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.37387415766716003, 'actor/pg_clipfrac': 0.0015576323494315147, 'actor/ppo_kl': 0.001950971083715558}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.20478707551956177, 'actor/pg_clipfrac': 0.0020993701182305813, 'actor/ppo_kl': -0.0013494044542312622}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.09661968052387238, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016169457230716944}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002063226274913177, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001643455761950463}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.28577521443367004, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002089375484501943}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.23992234468460083, 'actor/pg_clipfrac': 0.005839415825903416, 'actor/ppo_kl': 0.0017524468712508678}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.08214148133993149, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001290384796448052}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.05596690997481346, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013091269647702575}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.049840811640024185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013500228524208069}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.7484943866729736, 'actor/pg_clipfrac': 0.003395585808902979, 'actor/ppo_kl': -0.00019573963072616607}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.535560131072998, 'actor/pg_clipfrac': 0.0015105740167200565, 'actor/ppo_kl': -0.0010253958171233535}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.23078039288520813, 'actor/pg_clipfrac': 0.0013717421097680926, 'actor/ppo_kl': 0.0008255235152319074}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4011652171611786, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015571182302664965}
[36m(Runner pid=3309020)[0m Step 12
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.288
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.016
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.015
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.611
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.611
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 917645
[36m(Runner pid=3309020)[0m balanced_min: 917645
[36m(Runner pid=3309020)[0m max: 917682
[36m(Runner pid=3309020)[0m mean: 917645.0
[36m(Runner pid=3309020)[0m min: 917608
[36m(Runner pid=3309020)[0m minmax_diff: 74
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.437
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.113
[36m(Runner pid=3309020)[0m throughput: 1094.924
[36m(Runner pid=3309020)[0m time_per_step: 838.09
[36m(Runner pid=3309020)[0m total_num_tokens: 1835290
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 807.0
[36m(Runner pid=3309020)[0m mean: 466.74
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1121.0
[36m(Runner pid=3309020)[0m mean: 250.17
[36m(Runner pid=3309020)[0m min: 52.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.229
[36m(Runner pid=3309020)[0m format: 0.991
[36m(Runner pid=3309020)[0m overall: 0.611
[36m(Runner pid=3309020)[0m tag_reward: 0.997
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.408872815186038e-05
[36m(Runner pid=3309020)[0m gen: 0.151
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.306
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.154
[36m(Runner pid=3309020)[0m gen: 96.749
[36m(Runner pid=3309020)[0m old: 84.972
[36m(Runner pid=3309020)[0m ref: 87.055
[36m(Runner pid=3309020)[0m reward: 6.01
[36m(Runner pid=3309020)[0m step: 838.09
[36m(Runner pid=3309020)[0m update_actor: 562.461
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 13; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:30:27 [executor_base.py:219] It took 0.338430 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.70 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:31:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:30:27 [executor_base.py:219] It took 0.337233 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:31:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.78 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:31:49 [executor_base.py:208] It took 0.326283 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.78 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:31:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:31:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:31:58 [executor_base.py:208] It took 0.326410 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1096106469631195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00014962366549298167, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00010765292245196179, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00039907850441522896}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.47329145669937134, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002685870276764035, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.150991976028308e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.12654419243335724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010491471039131284}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5313416719436646, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00013065410894341767, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1398337483406067, 'actor/pg_clipfrac': 0.0019960079807788134, 'actor/ppo_kl': 1.9477036403259262e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00018250233551952988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2609342336654663, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.139914870262146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.02833208069205284, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.6249064207077026, 'actor/pg_clipfrac': 0.0014684287598356605, 'actor/ppo_kl': -0.0006670706789009273}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.09019152820110321, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.1019737496972084, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00014267300139181316, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.101891019876348e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00014040943642612547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00047817782615311444}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.11059199273586273, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022561996593140066}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00011435359192546457, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003678899956867099}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.32351696491241455, 'actor/pg_clipfrac': 0.0023603462614119053, 'actor/ppo_kl': -0.0005833095056004822}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2751429080963135, 'actor/pg_clipfrac': 0.0022831049282103777, 'actor/ppo_kl': 0.00047763550537638366}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00014511877088807523, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016150765586644411}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.22951890528202057, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00034165196120738983}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00011688621452776715, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002352033043280244}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.6358087062835693, 'actor/pg_clipfrac': 0.0012468828354030848, 'actor/ppo_kl': 0.0005829399451613426}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.7302379012107849, 'actor/pg_clipfrac': 0.00324324332177639, 'actor/ppo_kl': 0.00028779107378795743}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.38470855355262756, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00047319746227003634}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.30083730816841125, 'actor/pg_clipfrac': 0.00223380490206182, 'actor/ppo_kl': -0.0006518644513562322}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.11524281650781631, 'actor/pg_clipfrac': 0.0034512511920183897, 'actor/ppo_kl': -0.0003700963861774653}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.7315279841423035, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009137714514508843}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2791254222393036, 'actor/pg_clipfrac': 0.0016207455191761255, 'actor/ppo_kl': -0.0003136104787699878}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.30122679471969604, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001132346224039793}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.15733183920383453, 'actor/pg_clipfrac': 0.0024038462433964014, 'actor/ppo_kl': 0.00042680365731939673}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.20697832107543945, 'actor/pg_clipfrac': 0.0013071895809844136, 'actor/ppo_kl': -0.0011473363265395164}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00016572047024965286, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010388881200924516}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1633794754743576, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00215912121348083}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.08797068893909454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013033286668360233}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1741081029176712, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010006232187151909}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4359046220779419, 'actor/pg_clipfrac': 0.0025662959087640047, 'actor/ppo_kl': -0.00019577979401219636}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.06824422627687454, 'actor/pg_clipfrac': 0.0010940919164568186, 'actor/ppo_kl': -0.00038199350819922984}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2961955964565277, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00046008615754544735}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2677897810935974, 'actor/pg_clipfrac': 0.002626970177516341, 'actor/ppo_kl': -0.00014114880468696356}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0001272018998861313, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009568887762725353}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.21941688656806946, 'actor/pg_clipfrac': 0.0006531678372994065, 'actor/ppo_kl': 0.0011452139588072896}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.15965723991394043, 'actor/pg_clipfrac': 0.0013661201810464263, 'actor/ppo_kl': -0.001329450635239482}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.05696466565132141, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.5640258652638295e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00016749498900026083, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00010432676208438352}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.234249547123909, 'actor/pg_clipfrac': 0.0011876485077664256, 'actor/ppo_kl': -0.0002559901913627982}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.05443056672811508, 'actor/pg_clipfrac': 0.0010060361819341779, 'actor/ppo_kl': 4.9896163545781747e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002856724604498595, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010445729130879045}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.04957541823387146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000558640924282372}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00015493047249037772, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00048066495219245553}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.32189974188804626, 'actor/pg_clipfrac': 0.002801120514050126, 'actor/ppo_kl': 0.0017674389528110623}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1712772101163864, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000600161962211132}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.008280766196548939, 'actor/pg_clipfrac': 0.0012376237427815795, 'actor/ppo_kl': 0.0013723255833610892}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.4661354422569275, 'actor/pg_clipfrac': 0.0014367816038429737, 'actor/ppo_kl': -0.000202198134502396}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.538703978061676, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000272126606432721}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.010315604507923126, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006806710734963417}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.33492401242256165, 'actor/pg_clipfrac': 0.0021953897085040808, 'actor/ppo_kl': -0.0006683383253403008}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.06104549765586853, 'actor/pg_clipfrac': 0.001623376621864736, 'actor/ppo_kl': 0.0005511773051694036}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.07031361013650894, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.003148010466247797}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.44939932227134705, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011650437954813242}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001507391716586426, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00048365999828092754}
[36m(Runner pid=3309020)[0m Step 13
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.278
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: -8.224565959746855e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.618
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.618
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 909025
[36m(Runner pid=3309020)[0m balanced_min: 908728
[36m(Runner pid=3309020)[0m max: 910585
[36m(Runner pid=3309020)[0m mean: 908876.5
[36m(Runner pid=3309020)[0m min: 907168
[36m(Runner pid=3309020)[0m minmax_diff: 3417
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.864
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1070.813
[36m(Runner pid=3309020)[0m time_per_step: 848.772
[36m(Runner pid=3309020)[0m total_num_tokens: 1817753
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 598.0
[36m(Runner pid=3309020)[0m mean: 462.08
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2003.0
[36m(Runner pid=3309020)[0m mean: 247.98
[36m(Runner pid=3309020)[0m min: 41.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.241
[36m(Runner pid=3309020)[0m format: 0.994
[36m(Runner pid=3309020)[0m overall: 0.618
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.164
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.05
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.309
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.22
[36m(Runner pid=3309020)[0m gen: 104.288
[36m(Runner pid=3309020)[0m old: 83.764
[36m(Runner pid=3309020)[0m ref: 91.556
[36m(Runner pid=3309020)[0m reward: 6.367
[36m(Runner pid=3309020)[0m step: 848.772
[36m(Runner pid=3309020)[0m update_actor: 562.009
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 14; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:15:21, 3.55s/it, est. speed input: 128.03 toks/s, output: 18.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:18<33:41, 1.59s/it, est. speed input: 244.35 toks/s, output: 40.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:21<23:51, 1.13s/it, est. speed input: 312.66 toks/s, output: 58.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:22<15:48, 1.33it/s, est. speed input: 400.03 toks/s, output: 77.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:23<10:41, 1.96it/s, est. speed input: 486.92 toks/s, output: 98.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:24<08:42, 2.39it/s, est. speed input: 553.84 toks/s, output: 115.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:27<10:37, 1.95it/s, est. speed input: 564.48 toks/s, output: 120.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:28<08:25, 2.46it/s, est. speed input: 623.60 toks/s, output: 136.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<06:56, 2.97it/s, est. speed input: 681.46 toks/s, output: 155.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:29<05:02, 4.07it/s, est. speed input: 749.12 toks/s, output: 174.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:29<03:41, 5.53it/s, est. speed input: 820.93 toks/s, output: 190.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:30<03:13, 6.29it/s, est. speed input: 882.13 toks/s, output: 206.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:30<02:37, 7.72it/s, est. speed input: 945.18 toks/s, output: 226.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:31<01:31, 13.08it/s, est. speed input: 1146.51 toks/s, output: 283.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:31<01:23, 14.33it/s, est. speed input: 1210.84 toks/s, output: 302.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:31<01:01, 19.24it/s, est. speed input: 1340.65 toks/s, output: 344.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:31<00:55, 21.13it/s, est. speed input: 1405.54 toks/s, output: 365.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:32<00:50, 23.16it/s, est. speed input: 1470.90 toks/s, output: 386.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:32<00:50, 23.05it/s, est. speed input: 1529.27 toks/s, output: 404.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:32<00:50, 23.21it/s, est. speed input: 1586.04 toks/s, output: 421.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<01:12, 15.94it/s, est. speed input: 1680.89 toks/s, output: 449.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:33<01:17, 14.79it/s, est. speed input: 1728.12 toks/s, output: 464.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:33<01:05, 17.40it/s, est. speed input: 1792.15 toks/s, output: 487.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:34<00:47, 23.79it/s, est. speed input: 1915.27 toks/s, output: 528.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:34<00:46, 24.13it/s, est. speed input: 1966.83 toks/s, output: 544.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:34<00:53, 21.12it/s, est. speed input: 2013.16 toks/s, output: 560.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:34<00:30, 36.67it/s, est. speed input: 2200.63 toks/s, output: 619.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:35<00:28, 38.22it/s, est. speed input: 2312.83 toks/s, output: 666.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:35<00:20, 53.70it/s, est. speed input: 2516.50 toks/s, output: 729.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:35<00:17, 62.33it/s, est. speed input: 2690.14 toks/s, output: 789.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:35<00:15, 67.27it/s, est. speed input: 2816.28 toks/s, output: 831.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:35<00:20, 52.08it/s, est. speed input: 2975.91 toks/s, output: 897.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:36<00:20, 49.75it/s, est. speed input: 3079.18 toks/s, output: 930.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:36<00:25, 39.75it/s, est. speed input: 3172.13 toks/s, output: 967.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:36<00:25, 40.61it/s, est. speed input: 3223.87 toks/s, output: 987.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:36<00:20, 49.35it/s, est. speed input: 3398.88 toks/s, output: 1044.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:37<00:21, 46.11it/s, est. speed input: 3499.99 toks/s, output: 1091.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:37<00:19, 50.41it/s, est. speed input: 3608.63 toks/s, output: 1132.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:37<00:18, 53.71it/s, est. speed input: 3772.11 toks/s, output: 1184.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:37<00:09, 94.45it/s, est. speed input: 4179.54 toks/s, output: 1357.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:37<00:10, 89.23it/s, est. speed input: 4341.90 toks/s, output: 1429.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:38<00:10, 88.42it/s, est. speed input: 4605.54 toks/s, output: 1542.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:38<00:11, 79.92it/s, est. speed input: 4702.38 toks/s, output: 1588.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:38<00:11, 78.96it/s, est. speed input: 4814.62 toks/s, output: 1638.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:38<00:08, 95.93it/s, est. speed input: 5040.59 toks/s, output: 1725.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:38<00:09, 88.08it/s, est. speed input: 5194.22 toks/s, output: 1799.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:38<00:08, 95.73it/s, est. speed input: 5348.36 toks/s, output: 1858.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:39<00:09, 88.49it/s, est. speed input: 5498.88 toks/s, output: 1932.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:39<00:10, 76.62it/s, est. speed input: 5589.55 toks/s, output: 1979.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:39<00:15, 52.24it/s, est. speed input: 5656.02 toks/s, output: 2013.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:40<00:17, 44.26it/s, est. speed input: 5763.41 toks/s, output: 2081.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:40<00:18, 41.44it/s, est. speed input: 5832.32 toks/s, output: 2123.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:40<00:08, 86.34it/s, est. speed input: 6373.54 toks/s, output: 2392.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:40<00:08, 82.39it/s, est. speed input: 6515.02 toks/s, output: 2467.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:40<00:08, 79.53it/s, est. speed input: 6595.36 toks/s, output: 2523.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:41<00:07, 92.16it/s, est. speed input: 6803.23 toks/s, output: 2635.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:41<00:06, 103.47it/s, est. speed input: 7001.46 toks/s, output: 2738.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:41<00:05, 109.51it/s, est. speed input: 7157.53 toks/s, output: 2816.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:41<00:04, 149.06it/s, est. speed input: 7474.36 toks/s, output: 2980.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:41<00:04, 117.54it/s, est. speed input: 7649.48 toks/s, output: 3080.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:41<00:04, 115.13it/s, est. speed input: 7795.48 toks/s, output: 3175.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:41<00:04, 128.43it/s, est. speed input: 7989.27 toks/s, output: 3298.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:42<00:04, 129.51it/s, est. speed input: 8122.86 toks/s, output: 3358.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:42<00:04, 119.58it/s, est. speed input: 8256.88 toks/s, output: 3446.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:42<00:04, 114.53it/s, est. speed input: 8392.44 toks/s, output: 3508.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:42<00:03, 122.52it/s, est. speed input: 8533.34 toks/s, output: 3597.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:42<00:03, 124.00it/s, est. speed input: 8722.10 toks/s, output: 3714.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:42<00:03, 124.67it/s, est. speed input: 8862.77 toks/s, output: 3802.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:42<00:03, 115.79it/s, est. speed input: 9034.86 toks/s, output: 3892.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:43<00:02, 139.83it/s, est. speed input: 9372.73 toks/s, output: 4107.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:43<00:02, 135.17it/s, est. speed input: 9554.55 toks/s, output: 4227.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:43<00:01, 181.11it/s, est. speed input: 9942.80 toks/s, output: 4439.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:43<00:02, 147.59it/s, est. speed input: 10102.03 toks/s, output: 4534.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:43<00:01, 165.17it/s, est. speed input: 10342.05 toks/s, output: 4718.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:44<00:02, 129.43it/s, est. speed input: 10497.88 toks/s, output: 4838.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:44<00:02, 92.15it/s, est. speed input: 10614.68 toks/s, output: 4911.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:44<00:02, 111.87it/s, est. speed input: 10858.00 toks/s, output: 5122.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:44<00:02, 103.78it/s, est. speed input: 10972.08 toks/s, output: 5228.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:44<00:01, 104.56it/s, est. speed input: 11089.78 toks/s, output: 5318.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:44<00:01, 132.46it/s, est. speed input: 11363.23 toks/s, output: 5557.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:45<00:01, 142.72it/s, est. speed input: 11547.80 toks/s, output: 5691.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:45<00:00, 143.26it/s, est. speed input: 11717.11 toks/s, output: 5854.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:45<00:00, 150.42it/s, est. speed input: 11906.49 toks/s, output: 6009.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:45<00:00, 119.30it/s, est. speed input: 12033.05 toks/s, output: 6152.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:45<00:00, 93.86it/s, est. speed input: 12114.59 toks/s, output: 6231.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:46<00:00, 86.29it/s, est. speed input: 12212.12 toks/s, output: 6325.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:46<00:00, 85.54it/s, est. speed input: 12309.43 toks/s, output: 6419.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:46<00:00, 64.23it/s, est. speed input: 12353.83 toks/s, output: 6504.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:46<00:00, 51.53it/s, est. speed input: 12366.85 toks/s, output: 6548.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:48<00:00, 25.74it/s, est. speed input: 12200.23 toks/s, output: 6522.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:49<00:00, 12.09it/s, est. speed input: 11817.84 toks/s, output: 6338.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:49<00:00, 25.66it/s, est. speed input: 11843.24 toks/s, output: 6373.37 toks/s]
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.68 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:44:38 [executor_base.py:219] It took 0.338627 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.60 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:45:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:44:38 [executor_base.py:219] It took 0.342354 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:45:59 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:45:59 [executor_base.py:208] It took 0.329648 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:46:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:46:01 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:46:01 [executor_base.py:208] It took 0.346081 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00016703233995940536, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.26399311423301697, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001550379383843392, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00042121222941204906}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.013114189729094505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.03135617822408676, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.44735363125801086, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3516959846019745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1782781034708023, 'actor/pg_clipfrac': 0.0015408321050927043, 'actor/ppo_kl': 0.000863109074998647}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.09872923791408539, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00012357215746305883, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012007249752059579}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00019448163220658898, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.09559442102909088, 'actor/pg_clipfrac': 0.0012360939290374517, 'actor/ppo_kl': -7.296964213310275e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2797728478908539, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.10657642781734467, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0014409029390662909, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.6956983804702759, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002621287712827325, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007328195497393608}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.6157163977622986, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00047461732174269855}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.08949058502912521, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.160294287838042e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.09163996577262878, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011162747396156192}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2481367588043213, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001101877074688673}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00021294863836374134, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005837950157001615}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00010865917283808812, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001072016020771116}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00012675777543336153, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006528689991682768}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.417121022939682, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014028529403731227}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.5171017646789551, 'actor/pg_clipfrac': 0.0017605633474886417, 'actor/ppo_kl': 0.0005358087946660817}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3307346701622009, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015921826707199216}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.24808982014656067, 'actor/pg_clipfrac': 0.003809523768723011, 'actor/ppo_kl': -0.00026936305221170187}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3443879187107086, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009962707990780473}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.14597299695014954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008722217753529549}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.4887939691543579, 'actor/pg_clipfrac': 0.0022675737272948027, 'actor/ppo_kl': 0.0024001344572752714}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3747040033340454, 'actor/pg_clipfrac': 0.0024038462433964014, 'actor/ppo_kl': 0.00042164785554632545}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.23674263060092926, 'actor/pg_clipfrac': 0.003284072270616889, 'actor/ppo_kl': -0.002012847922742367}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1027788296341896, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00209582457318902}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00028678899980150163, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005018501542508602}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00017127640603575855, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013017632300034165}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00528278574347496, 'actor/pg_clipfrac': 0.0009727626456879079, 'actor/ppo_kl': -0.0005787095869891346}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.09378060698509216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018688476411625743}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00023208266065921634, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008950577466748655}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.15068267285823822, 'actor/pg_clipfrac': 0.000974658876657486, 'actor/ppo_kl': 0.0012619044864550233}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.448040246963501, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008771428256295621}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.36358723044395447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012717230711132288}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2487301081418991, 'actor/pg_clipfrac': 0.002366863889619708, 'actor/ppo_kl': -0.0006196196773089468}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.8612154722213745, 'actor/pg_clipfrac': 0.002364066196605563, 'actor/ppo_kl': 0.0008990116184577346}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.6144279837608337, 'actor/pg_clipfrac': 0.004819277208298445, 'actor/ppo_kl': -0.0004319409199524671}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.025616103783249855, 'actor/pg_clipfrac': 0.004640371073037386, 'actor/ppo_kl': 0.002295268466696143}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.12293483316898346, 'actor/pg_clipfrac': 0.002209944650530815, 'actor/ppo_kl': 0.0007819644524715841}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3689359128475189, 'actor/pg_clipfrac': 0.004385964944958687, 'actor/ppo_kl': 0.0006724881823174655}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00024117391149047762, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016833411063998938}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00012409687042236328, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000176089204614982}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.29325252771377563, 'actor/pg_clipfrac': 0.0015479875728487968, 'actor/ppo_kl': -0.0026837459299713373}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.44113868474960327, 'actor/pg_clipfrac': 0.0014144271844998002, 'actor/ppo_kl': -0.0015761633403599262}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5310549736022949, 'actor/pg_clipfrac': 0.002453987719491124, 'actor/ppo_kl': -0.0009756755316630006}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.25534552335739136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001186820212751627}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.15264971554279327, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003968712408095598}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3358234763145447, 'actor/pg_clipfrac': 0.0018484288593754172, 'actor/ppo_kl': 0.0016523315571248531}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.14593173563480377, 'actor/pg_clipfrac': 0.0014992504147812724, 'actor/ppo_kl': 0.0002869401650968939}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.29813769459724426, 'actor/pg_clipfrac': 0.0007610350148752332, 'actor/ppo_kl': 0.0001075764957931824}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5678260326385498, 'actor/pg_clipfrac': 0.002424242440611124, 'actor/ppo_kl': -0.0006257190252654254}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002375163894612342, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009445518371649086}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00015678796626161784, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004279571585357189}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001582961849635467, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00045277891331352293}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001561618410050869, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001998415100388229}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.1148800477385521, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000548427109606564}
[36m(Runner pid=3309020)[0m Step 14
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.313
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.016
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.007
[36m(Runner pid=3309020)[0m ppo_kl: -3.057813702795897e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.636
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.636
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 908309
[36m(Runner pid=3309020)[0m balanced_min: 908308
[36m(Runner pid=3309020)[0m max: 908551
[36m(Runner pid=3309020)[0m mean: 908308.5
[36m(Runner pid=3309020)[0m min: 908066
[36m(Runner pid=3309020)[0m minmax_diff: 485
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.691
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1088.04
[36m(Runner pid=3309020)[0m time_per_step: 834.812
[36m(Runner pid=3309020)[0m total_num_tokens: 1816617
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 569.0
[36m(Runner pid=3309020)[0m mean: 462.773
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1204.0
[36m(Runner pid=3309020)[0m mean: 246.843
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.277
[36m(Runner pid=3309020)[0m format: 0.993
[36m(Runner pid=3309020)[0m overall: 0.636
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:36:49, 4.56s/it, est. speed input: 101.84 toks/s, output: 23.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<25:33, 1.21s/it, est. speed input: 287.68 toks/s, output: 63.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<22:34, 1.08s/it, est. speed input: 325.70 toks/s, output: 74.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<15:37, 1.34it/s, est. speed input: 403.63 toks/s, output: 94.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:27<10:47, 1.93it/s, est. speed input: 486.51 toks/s, output: 115.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<08:24, 2.47it/s, est. speed input: 546.61 toks/s, output: 130.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<08:05, 2.55it/s, est. speed input: 588.32 toks/s, output: 141.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<06:09, 3.34it/s, est. speed input: 651.04 toks/s, output: 159.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<04:27, 4.60it/s, est. speed input: 722.72 toks/s, output: 179.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:31<02:30, 8.12it/s, est. speed input: 869.72 toks/s, output: 221.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:31<01:37, 12.47it/s, est. speed input: 1012.05 toks/s, output: 261.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:31<00:53, 22.04it/s, est. speed input: 1295.35 toks/s, output: 342.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:32<01:07, 17.57it/s, est. speed input: 1396.77 toks/s, output: 376.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:32<01:12, 16.31it/s, est. speed input: 1447.31 toks/s, output: 390.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:33<01:07, 17.35it/s, est. speed input: 1504.74 toks/s, output: 407.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:33<01:03, 18.46it/s, est. speed input: 1560.87 toks/s, output: 425.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:33<00:55, 20.87it/s, est. speed input: 1619.96 toks/s, output: 446.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<00:49, 23.40it/s, est. speed input: 1681.75 toks/s, output: 468.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:33<00:52, 21.95it/s, est. speed input: 1740.28 toks/s, output: 490.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:34<00:45, 24.89it/s, est. speed input: 1860.56 toks/s, output: 534.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:34<00:28, 39.83it/s, est. speed input: 2055.31 toks/s, output: 600.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:34<00:36, 30.43it/s, est. speed input: 2158.53 toks/s, output: 636.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:35<00:31, 35.46it/s, est. speed input: 2280.35 toks/s, output: 679.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:35<00:33, 32.39it/s, est. speed input: 2444.49 toks/s, output: 745.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:35<00:27, 39.09it/s, est. speed input: 2559.35 toks/s, output: 784.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:35<00:24, 43.11it/s, est. speed input: 2675.79 toks/s, output: 834.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:36<00:22, 46.67it/s, est. speed input: 2784.11 toks/s, output: 881.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:36<00:22, 46.20it/s, est. speed input: 2891.44 toks/s, output: 926.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:36<00:14, 69.79it/s, est. speed input: 3132.97 toks/s, output: 1015.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:36<00:12, 84.12it/s, est. speed input: 3308.62 toks/s, output: 1087.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:36<00:18, 53.16it/s, est. speed input: 3452.50 toks/s, output: 1138.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:37<00:19, 49.52it/s, est. speed input: 3552.97 toks/s, output: 1187.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:37<00:13, 72.89it/s, est. speed input: 3834.78 toks/s, output: 1302.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:37<00:16, 58.48it/s, est. speed input: 3976.89 toks/s, output: 1352.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:37<00:16, 57.24it/s, est. speed input: 4073.32 toks/s, output: 1394.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:38<00:16, 57.31it/s, est. speed input: 4176.45 toks/s, output: 1431.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:38<00:14, 61.40it/s, est. speed input: 4287.86 toks/s, output: 1480.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:38<00:12, 73.67it/s, est. speed input: 4454.47 toks/s, output: 1551.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:38<00:10, 84.13it/s, est. speed input: 4621.52 toks/s, output: 1615.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:38<00:10, 83.42it/s, est. speed input: 4722.76 toks/s, output: 1652.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:38<00:09, 91.47it/s, est. speed input: 4984.61 toks/s, output: 1774.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:38<00:06, 126.97it/s, est. speed input: 5333.76 toks/s, output: 1920.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:39<00:07, 102.06it/s, est. speed input: 5477.48 toks/s, output: 1977.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:39<00:10, 74.77it/s, est. speed input: 5608.42 toks/s, output: 2040.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:39<00:12, 63.19it/s, est. speed input: 5694.28 toks/s, output: 2086.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:39<00:07, 98.26it/s, est. speed input: 6033.51 toks/s, output: 2241.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:40<00:07, 101.22it/s, est. speed input: 6188.26 toks/s, output: 2317.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:40<00:06, 109.28it/s, est. speed input: 6548.51 toks/s, output: 2476.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:40<00:06, 106.26it/s, est. speed input: 6691.57 toks/s, output: 2551.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:40<00:09, 69.90it/s, est. speed input: 6783.35 toks/s, output: 2604.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:41<00:09, 68.63it/s, est. speed input: 6905.10 toks/s, output: 2666.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:41<00:10, 64.10it/s, est. speed input: 6989.78 toks/s, output: 2710.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:41<00:07, 87.71it/s, est. speed input: 7262.34 toks/s, output: 2860.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:41<00:08, 74.59it/s, est. speed input: 7378.31 toks/s, output: 2925.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:41<00:07, 84.27it/s, est. speed input: 7519.82 toks/s, output: 3008.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:42<00:07, 78.62it/s, est. speed input: 7631.88 toks/s, output: 3092.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:42<00:05, 103.50it/s, est. speed input: 7904.72 toks/s, output: 3240.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:42<00:05, 100.27it/s, est. speed input: 8040.59 toks/s, output: 3309.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:42<00:04, 105.64it/s, est. speed input: 8175.06 toks/s, output: 3392.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:42<00:04, 106.53it/s, est. speed input: 8316.08 toks/s, output: 3467.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:42<00:04, 114.62it/s, est. speed input: 8550.40 toks/s, output: 3620.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:43<00:03, 122.98it/s, est. speed input: 8738.97 toks/s, output: 3721.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:43<00:03, 118.91it/s, est. speed input: 9000.45 toks/s, output: 3865.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:43<00:02, 139.85it/s, est. speed input: 9248.56 toks/s, output: 4028.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:43<00:02, 143.89it/s, est. speed input: 9427.30 toks/s, output: 4144.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:43<00:02, 128.00it/s, est. speed input: 9600.06 toks/s, output: 4272.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:43<00:02, 147.58it/s, est. speed input: 9854.42 toks/s, output: 4432.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:44<00:02, 157.36it/s, est. speed input: 10087.62 toks/s, output: 4587.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:44<00:01, 147.64it/s, est. speed input: 10349.96 toks/s, output: 4763.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:44<00:01, 142.90it/s, est. speed input: 10520.13 toks/s, output: 4908.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:44<00:01, 150.39it/s, est. speed input: 10758.79 toks/s, output: 5081.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:44<00:01, 144.10it/s, est. speed input: 10935.69 toks/s, output: 5205.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:44<00:01, 155.12it/s, est. speed input: 11107.76 toks/s, output: 5346.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:44<00:01, 148.25it/s, est. speed input: 11284.32 toks/s, output: 5473.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:45<00:01, 136.12it/s, est. speed input: 11447.17 toks/s, output: 5609.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:45<00:00, 157.20it/s, est. speed input: 11681.22 toks/s, output: 5802.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:45<00:01, 87.26it/s, est. speed input: 11766.26 toks/s, output: 5896.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:46<00:01, 71.49it/s, est. speed input: 11824.91 toks/s, output: 5977.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:46<00:01, 68.79it/s, est. speed input: 11915.30 toks/s, output: 6084.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:46<00:01, 69.75it/s, est. speed input: 11981.99 toks/s, output: 6150.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:46<00:01, 66.46it/s, est. speed input: 12030.47 toks/s, output: 6206.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:46<00:00, 69.68it/s, est. speed input: 12102.29 toks/s, output: 6279.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:46<00:00, 67.29it/s, est. speed input: 12155.32 toks/s, output: 6361.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:47<00:00, 76.91it/s, est. speed input: 12258.95 toks/s, output: 6482.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:47<00:00, 36.17it/s, est. speed input: 12178.48 toks/s, output: 6483.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:48<00:00, 33.97it/s, est. speed input: 12191.53 toks/s, output: 6534.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:48<00:00, 34.69it/s, est. speed input: 12218.41 toks/s, output: 6581.12 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:48<00:00, 26.41it/s, est. speed input: 12245.07 toks/s, output: 6609.55 toks/s]
[36m(Runner pid=3309020)[0m tag_reward: 0.997
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.154
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.048
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.308
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.249
[36m(Runner pid=3309020)[0m gen: 97.357
[36m(Runner pid=3309020)[0m old: 83.666
[36m(Runner pid=3309020)[0m ref: 86.443
[36m(Runner pid=3309020)[0m reward: 5.811
[36m(Runner pid=3309020)[0m step: 834.812
[36m(Runner pid=3309020)[0m update_actor: 560.402
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 15; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:58:35 [executor_base.py:219] It took 0.338262 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:59:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:58:35 [executor_base.py:219] It took 0.336047 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:59:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 00:59:56 [executor_base.py:208] It took 0.329614 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:59:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:59:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 00:59:56 [executor_base.py:208] It took 0.327584 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.11920419335365295, 'actor/pg_clipfrac': 0.0023255813866853714, 'actor/ppo_kl': -0.0012384570436552167}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00014202103193383664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016458757454529405}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00019384290499147028, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019783400930464268}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.34874311089515686, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.04465334862470627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00027986973873339593, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.8365815877914429, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.27073827385902405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001514479226898402, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1074737086892128, 'actor/pg_clipfrac': 0.0030441400595009327, 'actor/ppo_kl': 0.00103046465665102}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.11500754952430725, 'actor/pg_clipfrac': 0.0026490066666156054, 'actor/ppo_kl': 0.00207862863317132}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.22940899431705475, 'actor/pg_clipfrac': 0.001855287584476173, 'actor/ppo_kl': -0.0004835181753151119}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.24951934814453125, 'actor/pg_clipfrac': 0.0014265335630625486, 'actor/ppo_kl': -0.000580497900955379}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00022118880588095635, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030777923529967666}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00038897997001186013, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.514079749584198, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.200894296169281, 'actor/pg_clipfrac': 0.0019474197179079056, 'actor/ppo_kl': -0.00026355954469181597}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5462365746498108, 'actor/pg_clipfrac': 0.0015220700297504663, 'actor/ppo_kl': 0.0007254660013131797}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.771567702293396, 'actor/pg_clipfrac': 0.002994012087583542, 'actor/ppo_kl': 0.0007872781134210527}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.3216971755027771, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008395343320444226}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00013382124598138034, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007326955674216151}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.021997276693582535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009524210472591221}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4470756947994232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001505760825239122}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00012543020420707762, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005168021307326853}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.10762423276901245, 'actor/pg_clipfrac': 0.0008802816737443209, 'actor/ppo_kl': 0.00022578575590159744}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00016518670599907637, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0027611914556473494}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2543884515762329, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001443475834093988}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003019524156115949, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002736045280471444}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.10666567832231522, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002973983297124505}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:34:20, 15.06s/it, est. speed input: 30.69 toks/s, output: 4.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<24:33, 3.94s/it, est. speed input: 90.26 toks/s, output: 15.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:15<12:14, 1.98s/it, est. speed input: 148.07 toks/s, output: 25.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:15<07:37, 1.24s/it, est. speed input: 200.95 toks/s, output: 36.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 12/377 [00:16<03:06, 1.95it/s, est. speed input: 342.54 toks/s, output: 66.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 15/377 [00:16<02:07, 2.84it/s, est. speed input: 426.78 toks/s, output: 84.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 19/377 [00:16<01:21, 4.41it/s, est. speed input: 535.77 toks/s, output: 108.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 22/377 [00:16<01:03, 5.60it/s, est. speed input: 612.53 toks/s, output: 126.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:16<00:44, 7.96it/s, est. speed input: 720.37 toks/s, output: 154.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 33/377 [00:16<00:25, 13.40it/s, est. speed input: 907.88 toks/s, output: 202.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 38/377 [00:16<00:19, 17.12it/s, est. speed input: 1050.19 toks/s, output: 238.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 43/377 [00:16<00:15, 20.97it/s, est. speed input: 1180.89 toks/s, output: 273.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 47/377 [00:17<00:14, 22.07it/s, est. speed input: 1278.25 toks/s, output: 301.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 54/377 [00:17<00:11, 29.26it/s, est. speed input: 1455.83 toks/s, output: 354.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 59/377 [00:17<00:09, 31.98it/s, est. speed input: 1581.45 toks/s, output: 390.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 64/377 [00:17<00:09, 32.30it/s, est. speed input: 1697.77 toks/s, output: 427.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 74/377 [00:17<00:06, 44.70it/s, est. speed input: 1949.48 toks/s, output: 506.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 80/377 [00:17<00:06, 43.43it/s, est. speed input: 2086.50 toks/s, output: 552.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 85/377 [00:17<00:06, 43.26it/s, est. speed input: 2204.60 toks/s, output: 594.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 90/377 [00:18<00:07, 40.57it/s, est. speed input: 2314.76 toks/s, output: 634.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 96/377 [00:18<00:06, 43.63it/s, est. speed input: 2453.12 toks/s, output: 686.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 106/377 [00:18<00:04, 55.69it/s, est. speed input: 2692.97 toks/s, output: 774.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 114/377 [00:18<00:04, 60.15it/s, est. speed input: 2875.43 toks/s, output: 844.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 121/377 [00:18<00:04, 57.23it/s, est. speed input: 3027.80 toks/s, output: 904.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 129/377 [00:18<00:04, 53.94it/s, est. speed input: 3206.16 toks/s, output: 975.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:18<00:04, 52.23it/s, est. speed input: 3377.80 toks/s, output: 1046.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 145/377 [00:18<00:04, 56.39it/s, est. speed input: 3554.69 toks/s, output: 1121.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 151/377 [00:19<00:04, 55.33it/s, est. speed input: 3678.28 toks/s, output: 1178.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 158/377 [00:19<00:03, 57.00it/s, est. speed input: 3826.95 toks/s, output: 1246.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 165/377 [00:19<00:03, 58.41it/s, est. speed input: 3974.50 toks/s, output: 1316.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 178/377 [00:19<00:02, 71.98it/s, est. speed input: 4261.57 toks/s, output: 1451.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 186/377 [00:19<00:02, 66.06it/s, est. speed input: 4419.65 toks/s, output: 1531.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 195/377 [00:19<00:02, 71.16it/s, est. speed input: 4615.90 toks/s, output: 1625.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 203/377 [00:19<00:02, 63.14it/s, est. speed input: 4769.90 toks/s, output: 1707.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 213/377 [00:19<00:02, 68.98it/s, est. speed input: 4984.37 toks/s, output: 1817.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:20<00:02, 60.60it/s, est. speed input: 5129.77 toks/s, output: 1901.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 235/377 [00:20<00:01, 76.77it/s, est. speed input: 5424.79 toks/s, output: 2068.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 244/377 [00:20<00:01, 74.90it/s, est. speed input: 5603.53 toks/s, output: 2171.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 252/377 [00:20<00:01, 72.35it/s, est. speed input: 5758.82 toks/s, output: 2265.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 261/377 [00:20<00:01, 75.40it/s, est. speed input: 5937.81 toks/s, output: 2375.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 276/377 [00:20<00:01, 94.11it/s, est. speed input: 6258.11 toks/s, output: 2568.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 286/377 [00:20<00:01, 90.53it/s, est. speed input: 6449.47 toks/s, output: 2694.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 296/377 [00:21<00:01, 65.00it/s, est. speed input: 6589.77 toks/s, output: 2808.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 304/377 [00:21<00:01, 53.40it/s, est. speed input: 6698.08 toks/s, output: 2899.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 311/377 [00:21<00:01, 50.54it/s, est. speed input: 6802.90 toks/s, output: 2988.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 317/377 [00:21<00:01, 49.23it/s, est. speed input: 6899.43 toks/s, output: 3069.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 323/377 [00:21<00:01, 51.07it/s, est. speed input: 6999.68 toks/s, output: 3155.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 329/377 [00:21<00:00, 49.85it/s, est. speed input: 7090.76 toks/s, output: 3238.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 336/377 [00:21<00:00, 54.00it/s, est. speed input: 7212.50 toks/s, output: 3345.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 342/377 [00:22<00:00, 47.06it/s, est. speed input: 7286.87 toks/s, output: 3426.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 348/377 [00:22<00:00, 43.45it/s, est. speed input: 7363.21 toks/s, output: 3512.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▎| 353/377 [00:22<00:00, 34.52it/s, est. speed input: 7389.61 toks/s, output: 3571.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 357/377 [00:22<00:00, 26.74it/s, est. speed input: 7386.55 toks/s, output: 3611.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:23<00:00, 23.90it/s, est. speed input: 7397.49 toks/s, output: 3661.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:23<00:00, 18.30it/s, est. speed input: 7360.90 toks/s, output: 3681.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [00:23<00:00, 11.40it/s, est. speed input: 7236.92 toks/s, output: 3662.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:24<00:00, 11.02it/s, est. speed input: 7215.09 toks/s, output: 3683.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:24<00:00, 9.09it/s, est. speed input: 7144.46 toks/s, output: 3682.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:25<00:00, 4.55it/s, est. speed input: 6849.26 toks/s, output: 3573.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [01:03<00:16, 5.36s/it, est. speed input: 2794.30 toks/s, output: 1540.70 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.95it/s, est. speed input: 2813.98 toks/s, output: 1799.05 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00017078456585295498, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010276779066771269}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.16340279579162598, 'actor/pg_clipfrac': 0.001389854121953249, 'actor/ppo_kl': 0.0008380272192880511}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.41038158535957336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001063334639184177}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.4152173399925232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017999740084633231}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.15922077000141144, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011670375242829323}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.05971390753984451, 'actor/pg_clipfrac': 0.005376344081014395, 'actor/ppo_kl': 0.0017673758557066321}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.29345783591270447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007384609780274332}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.6883697509765625, 'actor/pg_clipfrac': 0.0013333333190530539, 'actor/ppo_kl': -0.00019662221893668175}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.06713102757930756, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006733532063663006}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.32793906331062317, 'actor/pg_clipfrac': 0.0010899183107540011, 'actor/ppo_kl': 0.0005905712605454028}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00016626798606012017, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007261003484018147}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00021758592629339546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011303449282422662}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00015171700215432793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000135737398522906}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.19957031309604645, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0027772029861807823}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.38993051648139954, 'actor/pg_clipfrac': 0.003033366985619068, 'actor/ppo_kl': -0.0010608715238049626}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.47103798389434814, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007175258942879736}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00034447884536348283, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006799183320254087}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0002665451029315591, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001081839669495821}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.056342627853155136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007847662782296538}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1554473489522934, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00097677914891392}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.43224310874938965, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005727738607674837}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.44683730602264404, 'actor/pg_clipfrac': 0.003546099178493023, 'actor/ppo_kl': 0.0009275362244807184}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.013783315196633339, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011152077931910753}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3341238796710968, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.1557923143263906e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.004008019808679819, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00013526933616958559}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2016598880290985, 'actor/pg_clipfrac': 0.0016583747928962111, 'actor/ppo_kl': -0.0024607612285763025}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001424436632078141, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017530799377709627}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.30102628469467163, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002733431465458125}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.13647013902664185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -3.0998970032669604e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.04988512024283409, 'actor/pg_clipfrac': 0.0012886597542092204, 'actor/ppo_kl': 0.00016905351367313415}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5337681770324707, 'actor/pg_clipfrac': 0.0011481055989861488, 'actor/ppo_kl': -0.0017872031312435865}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.0777183324098587, 'actor/pg_clipfrac': 0.0010638297535479069, 'actor/ppo_kl': 0.0003448019560892135}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00013050208508502692, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 9.095066343434155e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.025862308219075203, 'actor/pg_clipfrac': 0.0018744142726063728, 'actor/ppo_kl': 0.0002170087245758623}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.12267424911260605, 'actor/pg_clipfrac': 0.001396648003719747, 'actor/ppo_kl': -0.0011088994797319174}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:12:57 [executor_base.py:219] It took 0.343760 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:12:57 [executor_base.py:219] It took 0.342256 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:14:46 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:14:46 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:14:46 [executor_base.py:208] It took 0.325901 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:14:48 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:14:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:14:48 [executor_base.py:208] It took 0.328784 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric properties involved. Since E lies on the vertical line l passing through D, the distance DE is always 2.0 (the same as BD). The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC. To minimize this perimeter, we need to minimize AE + EC, which implies that E should be the point on the line AC such that AE + EC is minimized. This occurs when E is the point where the perpendicular from A to AC intersects AC, making AE + EC equal to AC. Therefore, the minimum perimeter of triangle AEC is AC + 2.0.\n\n\nThe minimum perimeter of triangle AEC is 16.0 - 2.0 = 14.0.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90 degrees to obtain triangle AB'C', we can deduce that angle BAC is 90 degrees and angle B'AC' is also 90 degrees. Since B and B' are the corresponding points after rotation, angle BAB' is 90 degrees. This means that triangle AB'B' is also a right triangle with angle B'AB' = 90 degrees.\n\nNow, we know that angle CC'B' is 32.0 degrees. Since B and B' are the corresponding points, angle CB'C' is equal to angle B'C'B = 32.0 degrees.\n\nIn triangle AB'C', we have angle BAC = 90 degrees, angle AB'C' = 90 degrees (from the rotation), and angle CB'C' = 32.0 degrees. The sum of the angles in a triangle is 180 degrees, so we can find angle AC'B' by subtracting the sum of the other two angles from 180 degrees.\n\nTherefore, angle AC'B' = 180 - 90 - 32 = 58 degrees.\n\n\nThe size of angle AC'B' is 58 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are on circle O, and C is on the diameter BE, we know that angle ADC is 54°. Since A, B, and D are on the circle, angle AEB is the same as angle ADC because they subtend the same arc AD. Therefore, angle AEB is also 54°.\n\n\nThe degree of angle AEB is 54°.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, angle A is 35.0 degrees. Since AB is the diameter, angle AOD is a right angle (90 degrees) because the angle subtended by a diameter in a semicircle is always a right angle. Since CD is a tangent to the circle at point D, angle ODC is also a right angle (90 degrees). Therefore, angle COD is the remaining angle in the right triangle ODC, which is 90 degrees - angle C. Since angle AOD is 90 degrees, angle COD is also 90 degrees - 35 degrees = 55 degrees. Thus, angle C is the same as angle COD, which is 55 degrees.\n\n\n55 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo solve this problem, let's analyze the given information and apply geometric principles step by step:\n\n1. **Given Information**: \n - Circle O is the circumscribed circle of triangle ABC.\n - AD is the diameter of circle O.\n - EA is a tangent to circle O at point A.\n - Angle EAC = 120.0 degrees.\n\n2. **Geometric Properties**:\n - Since AD is the diameter, angle ADB (and angle ADC) is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n - The tangent EA at point A means that angle EAD is 90 degrees (since the tangent is perpendicular to the radius at the point of tangency).\n\n3. **Angle Calculation**:\n - We know that angle EAC is given as 120.0 degrees.\n - Since EA is tangent to circle O at A, angle EAD is 90 degrees.\n - Therefore, angle CAD = angle EAC - angle EAD = 120.0 degrees - 90.0 degrees = 30.0 degrees.\n - Since AD is the diameter, angle ACD is a right angle (90 degrees) because it subtends the diameter.\n - Therefore, angle BCD = angle ACD - angle CAD = 90 degrees - 30.0 degrees = 60.0 degrees.\n\n4. **Conclusion**:\n - In triangle ABC, angle BCD is 60.0 degrees.\n - Since angle BCD is an exterior angle to triangle ABC at vertex C, it is equal to the sum of the two opposite interior angles, angle ABC and angle BAC.\n - Therefore, angle ABC = angle BCD = 60.0 degrees.\n\nThus, the degree of angle ABC is 60.0 degrees.\n\n\nThe degree of angle ABC is 60.0 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 15
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.27
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.018
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.01
[36m(Runner pid=3309020)[0m ppo_kl: 3.700296970521322e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.629
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.629
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 919669
[36m(Runner pid=3309020)[0m balanced_min: 919668
[36m(Runner pid=3309020)[0m max: 925605
[36m(Runner pid=3309020)[0m mean: 919668.5
[36m(Runner pid=3309020)[0m min: 913732
[36m(Runner pid=3309020)[0m minmax_diff: 11873
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.194
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.113
[36m(Runner pid=3309020)[0m throughput: 898.796
[36m(Runner pid=3309020)[0m time_per_step: 1023.223
[36m(Runner pid=3309020)[0m total_num_tokens: 1839337
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 667.0
[36m(Runner pid=3309020)[0m mean: 464.002
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1066.0
[36m(Runner pid=3309020)[0m mean: 254.489
[36m(Runner pid=3309020)[0m min: 55.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.261
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.629
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.212758075065103e-05
[36m(Runner pid=3309020)[0m gen: 0.147
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.306
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.133
[36m(Runner pid=3309020)[0m gen: 95.705
[36m(Runner pid=3309020)[0m old: 82.821
[36m(Runner pid=3309020)[0m ref: 85.242
[36m(Runner pid=3309020)[0m reward: 6.309
[36m(Runner pid=3309020)[0m save_checkpoint: 29.973
[36m(Runner pid=3309020)[0m step: 1023.223
[36m(Runner pid=3309020)[0m update_actor: 562.134
[36m(Runner pid=3309020)[0m validation: 160.315
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.302
[36m(Runner pid=3309020)[0m format_reward: 0.985
[36m(Runner pid=3309020)[0m overall_reward: 0.645
[36m(Runner pid=3309020)[0m reward_score: 0.645
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.989
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m Training Episode 1.
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 16; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:16:01 [executor_base.py:219] It took 0.385314 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:18<1:19:26, 3.74s/it, est. speed input: 115.56 toks/s, output: 24.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<40:43, 1.92s/it, est. speed input: 203.56 toks/s, output: 42.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<26:07, 1.24s/it, est. speed input: 271.79 toks/s, output: 59.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:24<17:04, 1.23it/s, est. speed input: 358.14 toks/s, output: 79.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:25<11:59, 1.74it/s, est. speed input: 434.00 toks/s, output: 100.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:26<08:41, 2.40it/s, est. speed input: 512.08 toks/s, output: 122.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:26<06:03, 3.43it/s, est. speed input: 591.80 toks/s, output: 140.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:26<04:35, 4.49it/s, est. speed input: 671.45 toks/s, output: 160.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:27<04:22, 4.71it/s, est. speed input: 732.00 toks/s, output: 179.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:28<03:29, 5.88it/s, est. speed input: 804.33 toks/s, output: 197.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:28<02:40, 7.64it/s, est. speed input: 879.56 toks/s, output: 217.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:28<02:19, 8.77it/s, est. speed input: 946.12 toks/s, output: 235.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:31<05:25, 3.73it/s, est. speed input: 929.58 toks/s, output: 232.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<04:08, 4.87it/s, est. speed input: 994.23 toks/s, output: 252.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:32<03:03, 6.55it/s, est. speed input: 1060.54 toks/s, output: 277.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:32<02:02, 9.75it/s, est. speed input: 1189.72 toks/s, output: 314.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:53, 10.51it/s, est. speed input: 1247.63 toks/s, output: 334.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:33<00:59, 19.64it/s, est. speed input: 1456.56 toks/s, output: 400.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:33<00:47, 24.36it/s, est. speed input: 1584.35 toks/s, output: 443.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:33<00:56, 20.39it/s, est. speed input: 1630.46 toks/s, output: 459.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<00:51, 22.54it/s, est. speed input: 1690.98 toks/s, output: 480.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:34<01:12, 15.70it/s, est. speed input: 1774.96 toks/s, output: 513.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<01:06, 17.09it/s, est. speed input: 1824.84 toks/s, output: 537.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:35<00:57, 19.62it/s, est. speed input: 1878.99 toks/s, output: 564.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<01:14, 15.25it/s, est. speed input: 1915.82 toks/s, output: 574.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:35<01:01, 18.32it/s, est. speed input: 1970.38 toks/s, output: 598.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:35<00:40, 27.57it/s, est. speed input: 2090.14 toks/s, output: 650.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:36<00:27, 40.69it/s, est. speed input: 2275.61 toks/s, output: 725.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:36<00:24, 44.64it/s, est. speed input: 2448.79 toks/s, output: 797.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:36<00:20, 51.68it/s, est. speed input: 2564.85 toks/s, output: 837.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:36<00:26, 39.92it/s, est. speed input: 2665.10 toks/s, output: 874.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:37<00:30, 34.83it/s, est. speed input: 2767.53 toks/s, output: 906.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:37<00:28, 36.56it/s, est. speed input: 2823.94 toks/s, output: 929.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:27, 38.45it/s, est. speed input: 2883.62 toks/s, output: 948.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:37<00:15, 66.96it/s, est. speed input: 3121.78 toks/s, output: 1032.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:37<00:19, 52.21it/s, est. speed input: 3219.37 toks/s, output: 1068.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:38<00:19, 51.42it/s, est. speed input: 3326.29 toks/s, output: 1115.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:38<00:14, 65.80it/s, est. speed input: 3559.92 toks/s, output: 1215.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:38<00:13, 71.38it/s, est. speed input: 3773.69 toks/s, output: 1312.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:38<00:10, 88.52it/s, est. speed input: 4062.29 toks/s, output: 1438.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:39<00:13, 70.05it/s, est. speed input: 4202.59 toks/s, output: 1494.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:39<00:09, 94.81it/s, est. speed input: 4538.64 toks/s, output: 1654.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:39<00:08, 108.05it/s, est. speed input: 4760.26 toks/s, output: 1760.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:39<00:11, 71.75it/s, est. speed input: 4878.45 toks/s, output: 1823.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:40<00:12, 66.75it/s, est. speed input: 4974.79 toks/s, output: 1864.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:40<00:09, 84.83it/s, est. speed input: 5191.38 toks/s, output: 1964.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:40<00:09, 89.69it/s, est. speed input: 5339.57 toks/s, output: 2038.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:40<00:09, 84.59it/s, est. speed input: 5484.95 toks/s, output: 2110.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:40<00:08, 89.98it/s, est. speed input: 5681.60 toks/s, output: 2198.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:40<00:06, 113.62it/s, est. speed input: 5947.90 toks/s, output: 2317.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:40<00:05, 135.00it/s, est. speed input: 6209.32 toks/s, output: 2446.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:41<00:05, 134.90it/s, est. speed input: 6415.74 toks/s, output: 2544.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:41<00:04, 147.22it/s, est. speed input: 6714.81 toks/s, output: 2699.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:41<00:04, 145.42it/s, est. speed input: 6910.44 toks/s, output: 2797.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:41<00:06, 105.12it/s, est. speed input: 7077.26 toks/s, output: 2890.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:41<00:04, 134.76it/s, est. speed input: 7387.31 toks/s, output: 3028.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:42<00:04, 127.09it/s, est. speed input: 7570.85 toks/s, output: 3149.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:42<00:04, 123.77it/s, est. speed input: 7707.96 toks/s, output: 3207.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:42<00:03, 151.55it/s, est. speed input: 8021.28 toks/s, output: 3389.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:42<00:03, 156.91it/s, est. speed input: 8267.16 toks/s, output: 3536.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:42<00:03, 152.40it/s, est. speed input: 8459.68 toks/s, output: 3652.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:42<00:03, 124.94it/s, est. speed input: 8635.39 toks/s, output: 3750.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:43<00:04, 103.76it/s, est. speed input: 8747.83 toks/s, output: 3823.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:43<00:04, 105.78it/s, est. speed input: 8874.36 toks/s, output: 3893.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:43<00:04, 108.49it/s, est. speed input: 9009.39 toks/s, output: 3953.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:43<00:04, 102.87it/s, est. speed input: 9130.32 toks/s, output: 4041.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:43<00:03, 127.24it/s, est. speed input: 9371.75 toks/s, output: 4215.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:43<00:01, 186.99it/s, est. speed input: 9773.05 toks/s, output: 4463.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:43<00:01, 170.19it/s, est. speed input: 9994.52 toks/s, output: 4607.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:44<00:01, 221.74it/s, est. speed input: 10449.31 toks/s, output: 4906.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:44<00:01, 200.27it/s, est. speed input: 10674.91 toks/s, output: 5063.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:44<00:01, 138.00it/s, est. speed input: 10862.21 toks/s, output: 5206.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:44<00:01, 155.59it/s, est. speed input: 11102.01 toks/s, output: 5365.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:44<00:01, 163.74it/s, est. speed input: 11307.77 toks/s, output: 5495.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:44<00:00, 165.18it/s, est. speed input: 11531.46 toks/s, output: 5673.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:45<00:00, 151.84it/s, est. speed input: 11697.80 toks/s, output: 5827.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:45<00:01, 94.71it/s, est. speed input: 11798.96 toks/s, output: 5942.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:45<00:01, 101.02it/s, est. speed input: 11914.96 toks/s, output: 6050.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:45<00:01, 89.05it/s, est. speed input: 12008.11 toks/s, output: 6165.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:46<00:00, 89.37it/s, est. speed input: 12155.43 toks/s, output: 6273.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:46<00:00, 64.17it/s, est. speed input: 12192.86 toks/s, output: 6339.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:46<00:00, 66.50it/s, est. speed input: 12255.52 toks/s, output: 6410.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:46<00:00, 54.04it/s, est. speed input: 12274.56 toks/s, output: 6442.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:47<00:00, 51.57it/s, est. speed input: 12317.95 toks/s, output: 6509.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:47<00:00, 54.59it/s, est. speed input: 12371.58 toks/s, output: 6575.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:47<00:00, 30.75it/s, est. speed input: 12284.52 toks/s, output: 6566.91 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:48<00:00, 26.66it/s, est. speed input: 12373.17 toks/s, output: 6649.38 toks/s]
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:17:23 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:16:01 [executor_base.py:219] It took 0.338159 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:17:24 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:17:24 [executor_base.py:208] It took 0.335589 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:17:23 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:17:24 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:17:24 [executor_base.py:208] It took 0.336658 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.1292618066072464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.4474460184574127, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.10446890443563461, 'actor/pg_clipfrac': 0.0016393442638218403, 'actor/ppo_kl': -0.0012798246461898088}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.000246787560172379, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.0785040631890297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.26515650749206543, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020843655802309513}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00020006478007417172, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001429214607924223}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.29195451736450195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.24414436519145966, 'actor/pg_clipfrac': 0.0012507817009463906, 'actor/ppo_kl': -0.0009394926601089537}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.057385869324207306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00021892749646212906, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00016124302055686712, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00031444005435332656}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.14146862924098969, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.29922059178352356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.13359534740447998, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.38127079606056213, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5087818503379822, 'actor/pg_clipfrac': 0.0033482143189758062, 'actor/ppo_kl': -0.0007403726922348142}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.9364425539970398, 'actor/pg_clipfrac': 0.0015847861068323255, 'actor/ppo_kl': -0.0006478856666944921}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.34757158160209656, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': -0.00011722408817149699}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.6283194422721863, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004633114149328321}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.333571195602417, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011427135905250907}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.19685247540473938, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': -0.004148821346461773}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00012571815750561655, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004564984410535544}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00012568665260914713, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002848226286005229}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.16097325086593628, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009188560070469975}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.49731823801994324, 'actor/pg_clipfrac': 0.0023068049922585487, 'actor/ppo_kl': -0.00027110311202704906}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00022886770602781326, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016480385966133326}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00021317867503967136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002443940145894885}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.32137230038642883, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012273298343643546}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.14990046620368958, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004601802211254835}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.15066702663898468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005119059351272881}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.30047357082366943, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.0768731701537035e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00030867988243699074, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009845185559242964}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.24589955806732178, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007677310495637357}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.4466714560985565, 'actor/pg_clipfrac': 0.009966777637600899, 'actor/ppo_kl': 0.002642786828801036}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.320807546377182, 'actor/pg_clipfrac': 0.002938295714557171, 'actor/ppo_kl': 0.0019745579920709133}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.40662986040115356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002739883726462722}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4544324278831482, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006183344521559775}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.9328691363334656, 'actor/pg_clipfrac': 0.00249791843816638, 'actor/ppo_kl': 0.0008142409496940672}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1100822240114212, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016891101840883493}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2912732660770416, 'actor/pg_clipfrac': 0.004777070134878159, 'actor/ppo_kl': 0.0014391704462468624}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1589096486568451, 'actor/pg_clipfrac': 0.004444444552063942, 'actor/ppo_kl': -0.0012412064243108034}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.08799527585506439, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020770106930285692}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.17136555910110474, 'actor/pg_clipfrac': 0.0004835590079892427, 'actor/ppo_kl': -0.0008214557892642915}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0001897195470519364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009062644094228745}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.10895539820194244, 'actor/pg_clipfrac': 0.0010672358330339193, 'actor/ppo_kl': 0.0009466515039093792}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.41406872868537903, 'actor/pg_clipfrac': 0.0012499999720603228, 'actor/ppo_kl': -0.00025055170408450067}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.07966979593038559, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005808698479086161}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00030112863169051707, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012093930272385478}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0589662566781044, 'actor/pg_clipfrac': 0.001124859438277781, 'actor/ppo_kl': -0.001612475491128862}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4987964332103729, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013916611205786467}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.13489335775375366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018386186566203833}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.36192750930786133, 'actor/pg_clipfrac': 0.0020040080416947603, 'actor/ppo_kl': 0.000719485164154321}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.08915352821350098, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001060535665601492}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3218967318534851, 'actor/pg_clipfrac': 0.0013157895300537348, 'actor/ppo_kl': 0.0007551720482297242}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00018954748520627618, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006609529373236}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.06442778557538986, 'actor/pg_clipfrac': 0.0024125452619045973, 'actor/ppo_kl': -0.0016636312939226627}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5644229054450989, 'actor/pg_clipfrac': 0.0012674271129071712, 'actor/ppo_kl': -0.000992065411992371}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.20660580694675446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002774510649032891}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1290763020515442, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017935781506821513}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00024433439830318093, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011226220522075891}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2009120136499405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003824208106379956}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.1276995688676834, 'actor/pg_clipfrac': 0.0011402508243918419, 'actor/ppo_kl': 0.0008105165907181799}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3971482813358307, 'actor/pg_clipfrac': 0.0015772870974615216, 'actor/ppo_kl': 0.0003755401121452451}
[36m(Runner pid=3309020)[0m Step 16
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.852
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.026
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: -0.003
[36m(Runner pid=3309020)[0m ppo_kl: -0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.007
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.007
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.624
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.624
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 914522
[36m(Runner pid=3309020)[0m balanced_min: 914521
[36m(Runner pid=3309020)[0m max: 915856
[36m(Runner pid=3309020)[0m mean: 914521.5
[36m(Runner pid=3309020)[0m min: 913187
[36m(Runner pid=3309020)[0m minmax_diff: 2669
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 108.069
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1084.0
[36m(Runner pid=3309020)[0m time_per_step: 843.655
[36m(Runner pid=3309020)[0m total_num_tokens: 1829043
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 465.211
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1284.0
[36m(Runner pid=3309020)[0m mean: 249.259
[36m(Runner pid=3309020)[0m min: 51.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.254
[36m(Runner pid=3309020)[0m format: 0.993
[36m(Runner pid=3309020)[0m overall: 0.624
[36m(Runner pid=3309020)[0m tag_reward: 0.997
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.958159272357903e-05
[36m(Runner pid=3309020)[0m gen: 0.156
[36m(Runner pid=3309020)[0m old: 0.048
[36m(Runner pid=3309020)[0m ref: 0.048
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.307
[36m(Runner pid=3309020)[0m timing_s:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:31:26, 4.30s/it, est. speed input: 108.07 toks/s, output: 23.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<37:54, 1.79s/it, est. speed input: 214.52 toks/s, output: 46.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<27:38, 1.31s/it, est. speed input: 278.38 toks/s, output: 66.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<19:13, 1.09it/s, est. speed input: 348.26 toks/s, output: 88.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<17:48, 1.17it/s, est. speed input: 383.17 toks/s, output: 99.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<08:52, 2.34it/s, est. speed input: 526.49 toks/s, output: 144.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<05:30, 3.74it/s, est. speed input: 662.74 toks/s, output: 191.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<04:43, 4.35it/s, est. speed input: 727.82 toks/s, output: 214.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:31<03:41, 5.54it/s, est. speed input: 796.15 toks/s, output: 234.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<02:52, 7.08it/s, est. speed input: 862.24 toks/s, output: 256.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<02:19, 8.70it/s, est. speed input: 929.99 toks/s, output: 274.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<01:55, 10.51it/s, est. speed input: 995.18 toks/s, output: 293.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:32<01:26, 13.94it/s, est. speed input: 1115.80 toks/s, output: 336.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:11, 16.66it/s, est. speed input: 1240.09 toks/s, output: 375.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<01:31, 12.89it/s, est. speed input: 1281.56 toks/s, output: 388.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:34<01:21, 14.54it/s, est. speed input: 1339.54 toks/s, output: 406.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<00:57, 20.37it/s, est. speed input: 1458.29 toks/s, output: 441.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:34<00:39, 28.97it/s, est. speed input: 1640.83 toks/s, output: 512.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:34<00:27, 40.83it/s, est. speed input: 1887.19 toks/s, output: 612.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:35<00:21, 52.04it/s, est. speed input: 2077.64 toks/s, output: 680.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:35<00:21, 52.83it/s, est. speed input: 2199.48 toks/s, output: 723.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:35<00:22, 49.57it/s, est. speed input: 2316.46 toks/s, output: 765.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:35<00:28, 38.71it/s, est. speed input: 2417.55 toks/s, output: 804.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:36<00:35, 30.51it/s, est. speed input: 2455.47 toks/s, output: 821.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:37, 28.55it/s, est. speed input: 2504.74 toks/s, output: 842.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:36<00:28, 37.44it/s, est. speed input: 2629.28 toks/s, output: 888.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:36<00:24, 42.83it/s, est. speed input: 2746.66 toks/s, output: 939.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:37<00:23, 44.36it/s, est. speed input: 2853.21 toks/s, output: 987.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:37<00:14, 70.51it/s, est. speed input: 3099.13 toks/s, output: 1092.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:37<00:14, 69.38it/s, est. speed input: 3205.89 toks/s, output: 1143.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:37<00:18, 53.81it/s, est. speed input: 3300.20 toks/s, output: 1175.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:37<00:15, 64.68it/s, est. speed input: 3477.63 toks/s, output: 1243.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:37<00:17, 56.35it/s, est. speed input: 3578.25 toks/s, output: 1293.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:38<00:14, 67.05it/s, est. speed input: 3750.72 toks/s, output: 1373.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:38<00:11, 84.57it/s, est. speed input: 3979.81 toks/s, output: 1478.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:38<00:11, 80.53it/s, est. speed input: 4190.78 toks/s, output: 1558.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:38<00:09, 96.33it/s, est. speed input: 4424.27 toks/s, output: 1643.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:38<00:10, 87.68it/s, est. speed input: 4585.48 toks/s, output: 1707.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:38<00:10, 86.02it/s, est. speed input: 4686.62 toks/s, output: 1759.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:39<00:09, 94.50it/s, est. speed input: 4845.81 toks/s, output: 1825.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:39<00:08, 96.02it/s, est. speed input: 5058.99 toks/s, output: 1912.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:39<00:13, 60.92it/s, est. speed input: 5175.15 toks/s, output: 1974.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:39<00:11, 71.57it/s, est. speed input: 5330.83 toks/s, output: 2052.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:40<00:09, 85.36it/s, est. speed input: 5537.98 toks/s, output: 2142.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:40<00:09, 81.61it/s, est. speed input: 5684.28 toks/s, output: 2215.88 toks/s]
Processed prompts: 39%|███▉ | 505/1280 [00:40<00:09, 84.74it/s, est. speed input: 5786.19 toks/s, output: 2260.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:40<00:07, 97.79it/s, est. speed input: 5958.85 toks/s, output: 2346.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:40<00:08, 90.56it/s, est. speed input: 6098.89 toks/s, output: 2408.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:40<00:05, 122.50it/s, est. speed input: 6473.23 toks/s, output: 2616.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:41<00:06, 109.95it/s, est. speed input: 6613.15 toks/s, output: 2693.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:41<00:07, 97.01it/s, est. speed input: 6745.24 toks/s, output: 2773.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:41<00:07, 89.67it/s, est. speed input: 6924.08 toks/s, output: 2864.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:41<00:07, 89.39it/s, est. speed input: 7016.63 toks/s, output: 2912.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:41<00:07, 89.20it/s, est. speed input: 7159.79 toks/s, output: 2988.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:41<00:05, 107.18it/s, est. speed input: 7349.39 toks/s, output: 3100.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:42<00:05, 107.32it/s, est. speed input: 7494.58 toks/s, output: 3178.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:42<00:04, 117.91it/s, est. speed input: 7693.78 toks/s, output: 3295.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:42<00:04, 128.11it/s, est. speed input: 7901.54 toks/s, output: 3393.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:42<00:04, 131.00it/s, est. speed input: 8048.09 toks/s, output: 3492.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:42<00:04, 128.55it/s, est. speed input: 8185.53 toks/s, output: 3578.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:42<00:04, 127.41it/s, est. speed input: 8330.33 toks/s, output: 3660.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:42<00:03, 151.21it/s, est. speed input: 8568.05 toks/s, output: 3795.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:42<00:03, 142.92it/s, est. speed input: 8741.23 toks/s, output: 3893.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:43<00:02, 162.06it/s, est. speed input: 8978.83 toks/s, output: 4053.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:43<00:02, 166.27it/s, est. speed input: 9169.55 toks/s, output: 4183.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:43<00:02, 170.27it/s, est. speed input: 9363.06 toks/s, output: 4291.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:43<00:02, 157.03it/s, est. speed input: 9543.60 toks/s, output: 4402.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:43<00:02, 147.26it/s, est. speed input: 9714.11 toks/s, output: 4517.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:43<00:02, 157.49it/s, est. speed input: 9903.91 toks/s, output: 4653.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:43<00:02, 143.20it/s, est. speed input: 10072.30 toks/s, output: 4771.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:43<00:02, 149.09it/s, est. speed input: 10260.64 toks/s, output: 4864.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:44<00:01, 159.03it/s, est. speed input: 10444.48 toks/s, output: 5001.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:44<00:01, 178.98it/s, est. speed input: 10690.88 toks/s, output: 5146.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:44<00:01, 209.58it/s, est. speed input: 10983.78 toks/s, output: 5330.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:44<00:01, 163.78it/s, est. speed input: 11181.87 toks/s, output: 5456.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:44<00:01, 168.03it/s, est. speed input: 11370.09 toks/s, output: 5587.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:44<00:01, 160.54it/s, est. speed input: 11545.26 toks/s, output: 5709.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:44<00:01, 132.24it/s, est. speed input: 11699.89 toks/s, output: 5837.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:45<00:01, 119.98it/s, est. speed input: 11810.21 toks/s, output: 5938.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:45<00:01, 110.82it/s, est. speed input: 11919.15 toks/s, output: 6038.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:45<00:01, 75.35it/s, est. speed input: 11969.70 toks/s, output: 6148.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:45<00:01, 69.26it/s, est. speed input: 12015.71 toks/s, output: 6197.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:46<00:00, 93.74it/s, est. speed input: 12233.98 toks/s, output: 6393.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:46<00:00, 69.88it/s, est. speed input: 12284.18 toks/s, output: 6457.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:46<00:00, 58.43it/s, est. speed input: 12314.20 toks/s, output: 6497.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:47<00:00, 43.15it/s, est. speed input: 12308.92 toks/s, output: 6529.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:47<00:00, 28.56it/s, est. speed input: 12222.76 toks/s, output: 6528.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:48<00:00, 26.16it/s, est. speed input: 12195.54 toks/s, output: 6541.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:49<00:00, 12.46it/s, est. speed input: 11849.86 toks/s, output: 6417.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:01<00:00, 12.46it/s, est. speed input: 11849.86 toks/s, output: 6417.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:11<00:00, 1.23it/s, est. speed input: 8319.49 toks/s, output: 4558.04 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:11<00:00, 17.94it/s, est. speed input: 8319.49 toks/s, output: 4558.04 toks/s]
[36m(Runner pid=3309020)[0m adv: 0.164
[36m(Runner pid=3309020)[0m gen: 99.415
[36m(Runner pid=3309020)[0m old: 87.117
[36m(Runner pid=3309020)[0m ref: 87.83
[36m(Runner pid=3309020)[0m reward: 6.138
[36m(Runner pid=3309020)[0m step: 843.655
[36m(Runner pid=3309020)[0m update_actor: 562.307
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 17; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:30:10 [executor_base.py:219] It took 0.341577 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.96 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.53 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:31:34 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:30:10 [executor_base.py:219] It took 0.340129 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:31:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:31:34 [executor_base.py:208] It took 0.327625 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:32:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:32:01 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:32:01 [executor_base.py:208] It took 0.326336 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002505202137399465, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004340621526353061}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.20952656865119934, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00021641436615027487, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.41021475195884705, 'actor/pg_clipfrac': 0.0013157895300537348, 'actor/ppo_kl': 0.000749424856621772}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001502178783994168, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.637060821056366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.25807011127471924, 'actor/pg_clipfrac': 0.0018518518190830946, 'actor/ppo_kl': 0.0021727951243519783}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.17134997248649597, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015238110208883882}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.26412954926490784, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019577350467443466}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.5025642514228821, 'actor/pg_clipfrac': 0.0009416195680387318, 'actor/ppo_kl': -0.0015432210639119148}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.44743096828460693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2757151126861572, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.010795357637107372, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.34802916646003723, 'actor/pg_clipfrac': 0.004830917809158564, 'actor/ppo_kl': 0.00011587527114897966}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6221267580986023, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.13256244361400604, 'actor/pg_clipfrac': 0.0015649452107027173, 'actor/ppo_kl': -0.0009108649101108313}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.48387351632118225, 'actor/pg_clipfrac': 0.003597122384235263, 'actor/ppo_kl': -0.0005104398587718606}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.10241236537694931, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001095559448003769}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.24266110360622406, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004026574897579849}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.15202458202838898, 'actor/pg_clipfrac': 0.002721088472753763, 'actor/ppo_kl': 0.0003716371429618448}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3340413272380829, 'actor/pg_clipfrac': 0.0019723866134881973, 'actor/ppo_kl': -0.0012777411611750722}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.12156247347593307, 'actor/pg_clipfrac': 0.0012345679569989443, 'actor/ppo_kl': -0.001575550064444542}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0001820936449803412, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008278329623863101}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.15676556527614594, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006711783353239298}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.934912383556366, 'actor/pg_clipfrac': 0.0011641443707048893, 'actor/ppo_kl': 0.0005131456418894231}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2715010643005371, 'actor/pg_clipfrac': 0.0007598784286528826, 'actor/ppo_kl': -0.002292596735060215}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00013765261974185705, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014290980470832437}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.51185542345047, 'actor/pg_clipfrac': 0.002204261487349868, 'actor/ppo_kl': 0.0007899534539319575}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00010899093467742205, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00048768019769340754}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.08100910484790802, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015251412987709045}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.047466930001974106, 'actor/pg_clipfrac': 0.0015174506697803736, 'actor/ppo_kl': 0.0020171962678432465}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3276177942752838, 'actor/pg_clipfrac': 0.0013140604132786393, 'actor/ppo_kl': 0.0010678119724616408}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.15915273129940033, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007120692753233016}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002045371220447123, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002724645601119846}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00015540453023277223, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012998932506889105}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022253546922001988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002721781493164599}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00018059236754197627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00048373755998909473}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.769698977470398, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006326158181764185}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4063870906829834, 'actor/pg_clipfrac': 0.0012987012742087245, 'actor/ppo_kl': 0.0005395715706981719}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.36880216002464294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009992498671635985}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.37249353528022766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005971967475488782}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2503871023654938, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010173255577683449}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.45881280303001404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012627985561266541}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.31366175413131714, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010936011094599962}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5347057580947876, 'actor/pg_clipfrac': 0.0033444815780967474, 'actor/ppo_kl': -0.0016814330592751503}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23417337238788605, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018172599375247955}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.4202059805393219, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.669008093420416e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3080834746360779, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008374279132112861}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00011354935122653842, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014653330435976386}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.6509777903556824, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013834761921316385}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3551827371120453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002136353636160493}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.21444757282733917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002750233979895711}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002956668904516846, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002692625857889652}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00020870982552878559, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009389203041791916}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.32161322236061096, 'actor/pg_clipfrac': 0.0014577260008081794, 'actor/ppo_kl': -0.0012736862991005182}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.13910655677318573, 'actor/pg_clipfrac': 0.001108647440560162, 'actor/ppo_kl': -0.0005147071206010878}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.23170040547847748, 'actor/pg_clipfrac': 0.0009157509193755686, 'actor/ppo_kl': 0.0011482849949970841}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.09658768028020859, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.9046107051253784e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2815781831741333, 'actor/pg_clipfrac': 0.0011098779505118728, 'actor/ppo_kl': -0.0006471056258305907}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.18541519343852997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012372530763968825}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.11517533659934998, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007905771490186453}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.39431869983673096, 'actor/pg_clipfrac': 0.0022148394491523504, 'actor/ppo_kl': -8.495833753840998e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.09878867864608765, 'actor/pg_clipfrac': 0.003710575168952346, 'actor/ppo_kl': 0.0022825247142463923}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2242807298898697, 'actor/pg_clipfrac': 0.0027173913549631834, 'actor/ppo_kl': -0.0004674818192142993}
[36m(Runner pid=3309020)[0m Step 17
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.268
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.017
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 7.863501371474513e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.626
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.626
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 911743
[36m(Runner pid=3309020)[0m balanced_min: 911743
[36m(Runner pid=3309020)[0m max: 915597
[36m(Runner pid=3309020)[0m mean: 911743.0
[36m(Runner pid=3309020)[0m min: 907889
[36m(Runner pid=3309020)[0m minmax_diff: 7708
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.158
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.112
[36m(Runner pid=3309020)[0m throughput: 1038.351
[36m(Runner pid=3309020)[0m time_per_step: 878.068
[36m(Runner pid=3309020)[0m total_num_tokens: 1823486
[36m(Runner pid=3309020)[0m prompt_length:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:25:38, 4.03s/it, est. speed input: 109.17 toks/s, output: 22.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<38:42, 1.83s/it, est. speed input: 209.60 toks/s, output: 43.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:22<22:12, 1.05s/it, est. speed input: 305.07 toks/s, output: 64.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:23<15:18, 1.37it/s, est. speed input: 379.01 toks/s, output: 82.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<16:48, 1.24it/s, est. speed input: 399.09 toks/s, output: 86.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<08:26, 2.46it/s, est. speed input: 553.90 toks/s, output: 128.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<07:02, 2.94it/s, est. speed input: 617.73 toks/s, output: 148.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<05:19, 3.87it/s, est. speed input: 689.82 toks/s, output: 164.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:29<04:33, 4.50it/s, est. speed input: 748.06 toks/s, output: 182.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:30<03:28, 5.87it/s, est. speed input: 823.04 toks/s, output: 202.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:31<04:22, 4.64it/s, est. speed input: 857.79 toks/s, output: 212.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:31<03:15, 6.21it/s, est. speed input: 924.69 toks/s, output: 234.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:32<01:20, 14.89it/s, est. speed input: 1203.23 toks/s, output: 318.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:32<01:11, 16.57it/s, est. speed input: 1276.91 toks/s, output: 340.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:33<01:21, 14.42it/s, est. speed input: 1436.90 toks/s, output: 387.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:33<01:18, 14.92it/s, est. speed input: 1493.66 toks/s, output: 407.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:33<00:48, 23.57it/s, est. speed input: 1692.93 toks/s, output: 465.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:34<00:39, 29.25it/s, est. speed input: 1824.12 toks/s, output: 508.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:34<00:29, 37.93it/s, est. speed input: 2010.96 toks/s, output: 574.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:34<00:27, 40.47it/s, est. speed input: 2128.26 toks/s, output: 620.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:34<00:23, 46.35it/s, est. speed input: 2251.20 toks/s, output: 666.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:34<00:25, 43.65it/s, est. speed input: 2363.25 toks/s, output: 699.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:35<00:27, 39.37it/s, est. speed input: 2468.79 toks/s, output: 739.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:35<00:42, 25.39it/s, est. speed input: 2491.85 toks/s, output: 749.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:35<00:39, 27.56it/s, est. speed input: 2545.32 toks/s, output: 775.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:35<00:21, 49.41it/s, est. speed input: 2794.97 toks/s, output: 870.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:36<00:26, 39.03it/s, est. speed input: 2890.26 toks/s, output: 908.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:36<00:24, 43.18it/s, est. speed input: 3000.37 toks/s, output: 936.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:36<00:20, 50.90it/s, est. speed input: 3114.25 toks/s, output: 979.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:36<00:18, 53.90it/s, est. speed input: 3223.36 toks/s, output: 1021.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:37<00:26, 37.71it/s, est. speed input: 3305.93 toks/s, output: 1057.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:37<00:16, 59.82it/s, est. speed input: 3550.00 toks/s, output: 1158.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:37<00:17, 57.33it/s, est. speed input: 3650.95 toks/s, output: 1200.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:37<00:16, 59.85it/s, est. speed input: 3757.62 toks/s, output: 1245.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:37<00:16, 57.63it/s, est. speed input: 3857.05 toks/s, output: 1289.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:38<00:13, 69.11it/s, est. speed input: 4021.25 toks/s, output: 1363.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:38<00:15, 59.49it/s, est. speed input: 4113.48 toks/s, output: 1404.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:38<00:13, 66.49it/s, est. speed input: 4270.16 toks/s, output: 1483.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:38<00:14, 64.18it/s, est. speed input: 4372.99 toks/s, output: 1537.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:38<00:13, 67.48it/s, est. speed input: 4478.39 toks/s, output: 1590.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:38<00:13, 64.96it/s, est. speed input: 4580.61 toks/s, output: 1632.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:39<00:22, 38.64it/s, est. speed input: 4641.55 toks/s, output: 1662.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:39<00:15, 55.96it/s, est. speed input: 4860.95 toks/s, output: 1769.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:39<00:13, 61.13it/s, est. speed input: 5019.17 toks/s, output: 1847.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:39<00:09, 88.52it/s, est. speed input: 5288.41 toks/s, output: 1984.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:40<00:09, 86.55it/s, est. speed input: 5436.76 toks/s, output: 2071.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:40<00:09, 86.79it/s, est. speed input: 5585.42 toks/s, output: 2133.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:40<00:07, 107.84it/s, est. speed input: 5801.81 toks/s, output: 2224.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:40<00:07, 106.19it/s, est. speed input: 6009.58 toks/s, output: 2336.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:40<00:07, 92.80it/s, est. speed input: 6145.75 toks/s, output: 2408.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:40<00:07, 99.67it/s, est. speed input: 6297.27 toks/s, output: 2488.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:41<00:07, 99.82it/s, est. speed input: 6447.49 toks/s, output: 2554.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:41<00:05, 116.62it/s, est. speed input: 6661.04 toks/s, output: 2646.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:41<00:06, 106.58it/s, est. speed input: 6805.57 toks/s, output: 2723.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:41<00:05, 112.42it/s, est. speed input: 6956.44 toks/s, output: 2809.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:41<00:04, 139.58it/s, est. speed input: 7221.05 toks/s, output: 2947.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:41<00:03, 168.14it/s, est. speed input: 7526.00 toks/s, output: 3095.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:41<00:03, 159.03it/s, est. speed input: 7722.22 toks/s, output: 3179.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:41<00:02, 184.82it/s, est. speed input: 8033.13 toks/s, output: 3353.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:42<00:02, 205.76it/s, est. speed input: 8348.66 toks/s, output: 3497.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:42<00:02, 174.65it/s, est. speed input: 8577.04 toks/s, output: 3628.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:42<00:03, 140.67it/s, est. speed input: 8748.52 toks/s, output: 3751.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:42<00:03, 127.64it/s, est. speed input: 8923.26 toks/s, output: 3836.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:42<00:03, 136.62it/s, est. speed input: 9131.89 toks/s, output: 3941.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:42<00:03, 129.28it/s, est. speed input: 9260.44 toks/s, output: 4015.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:43<00:02, 161.13it/s, est. speed input: 9558.73 toks/s, output: 4184.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:43<00:03, 110.69it/s, est. speed input: 9705.15 toks/s, output: 4280.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:43<00:03, 113.21it/s, est. speed input: 9840.36 toks/s, output: 4381.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:43<00:02, 120.02it/s, est. speed input: 9972.28 toks/s, output: 4448.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:43<00:02, 123.93it/s, est. speed input: 10106.18 toks/s, output: 4541.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:43<00:02, 139.48it/s, est. speed input: 10298.59 toks/s, output: 4663.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:43<00:01, 156.88it/s, est. speed input: 10589.26 toks/s, output: 4885.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:44<00:01, 173.12it/s, est. speed input: 10836.52 toks/s, output: 5063.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:44<00:01, 170.60it/s, est. speed input: 11108.03 toks/s, output: 5231.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:44<00:01, 159.56it/s, est. speed input: 11332.41 toks/s, output: 5400.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:44<00:01, 116.35it/s, est. speed input: 11470.59 toks/s, output: 5532.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:44<00:01, 121.76it/s, est. speed input: 11604.76 toks/s, output: 5647.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:45<00:00, 146.68it/s, est. speed input: 11880.80 toks/s, output: 5853.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:45<00:00, 118.79it/s, est. speed input: 12033.46 toks/s, output: 5964.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:45<00:00, 110.89it/s, est. speed input: 12147.62 toks/s, output: 6057.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:45<00:01, 72.94it/s, est. speed input: 12190.56 toks/s, output: 6129.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:46<00:00, 72.56it/s, est. speed input: 12259.16 toks/s, output: 6202.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:46<00:00, 72.53it/s, est. speed input: 12321.51 toks/s, output: 6283.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:46<00:00, 76.89it/s, est. speed input: 12430.91 toks/s, output: 6369.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:46<00:00, 54.80it/s, est. speed input: 12438.07 toks/s, output: 6407.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:46<00:00, 47.64it/s, est. speed input: 12456.75 toks/s, output: 6471.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:47<00:00, 47.29it/s, est. speed input: 12502.24 toks/s, output: 6539.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:47<00:00, 28.66it/s, est. speed input: 12412.65 toks/s, output: 6536.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 8.08it/s, est. speed input: 11744.45 toks/s, output: 6209.81 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 25.19it/s, est. speed input: 11744.45 toks/s, output: 6209.81 toks/s]
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.223
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1191.0
[36m(Runner pid=3309020)[0m mean: 246.077
[36m(Runner pid=3309020)[0m min: 47.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.256
[36m(Runner pid=3309020)[0m format: 0.994
[36m(Runner pid=3309020)[0m overall: 0.626
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.202
[36m(Runner pid=3309020)[0m old: 0.049
[36m(Runner pid=3309020)[0m ref: 0.05
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.309
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.2
[36m(Runner pid=3309020)[0m gen: 127.299
[36m(Runner pid=3309020)[0m old: 88.974
[36m(Runner pid=3309020)[0m ref: 91.547
[36m(Runner pid=3309020)[0m reward: 6.181
[36m(Runner pid=3309020)[0m step: 878.068
[36m(Runner pid=3309020)[0m update_actor: 563.211
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 18; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:44:50 [executor_base.py:219] It took 0.341380 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:46:12 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:44:50 [executor_base.py:219] It took 0.341473 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:46:13 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:46:13 [executor_base.py:208] It took 0.327966 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:46:13 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:46:14 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:46:14 [executor_base.py:208] It took 0.326070 seconds to fall asleep.
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00018830937915481627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00015505991177633405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00017445556295569986, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011591784423217177}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00018484992324374616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0003375467495061457, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00024691407452337444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002078294288367033}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.17779631912708282, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.21961280703544617, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.08588898181915283, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00025531454593874514, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00015543332847300917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.12325180321931839, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005930784973315895}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.46235722303390503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.4904957711696625, 'actor/pg_clipfrac': 0.004444444552063942, 'actor/ppo_kl': -4.490604987950064e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3509005904197693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001018543029204011}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00013947409752290696, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005107524921186268}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.49222275614738464, 'actor/pg_clipfrac': 0.002060439670458436, 'actor/ppo_kl': 0.0007774371188133955}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3660934269428253, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010114704491570592}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08471644669771194, 'actor/pg_clipfrac': 0.0021715527400374413, 'actor/ppo_kl': -4.2334442696301267e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0001810649409890175, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006538219749927521}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.5594836473464966, 'actor/pg_clipfrac': 0.0024135157000273466, 'actor/ppo_kl': -0.00019103869271930307}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.08521097153425217, 'actor/pg_clipfrac': 0.0015797788510099053, 'actor/ppo_kl': -0.00010187592124566436}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5503352284431458, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010375494603067636}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00011615575931500643, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018100531306117773}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001877604372566566, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00026517052901908755}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.030510278418660164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030435025109909475}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5065891146659851, 'actor/pg_clipfrac': 0.0009823183063417673, 'actor/ppo_kl': -0.0004930084105581045}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0001358043955406174, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007415928412228823}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.20055165886878967, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0019342144951224327}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.20382940769195557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010789947118610144}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.14676137268543243, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004806172801181674}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.17250818014144897, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018882511649280787}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5775193572044373, 'actor/pg_clipfrac': 0.0023041474632918835, 'actor/ppo_kl': -0.000525076815392822}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00015436738613061607, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006304350099526346}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.12858806550502777, 'actor/pg_clipfrac': 0.0008944543660618365, 'actor/ppo_kl': 0.0005302821518853307}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022757082479074597, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00023535065702162683}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.22961781919002533, 'actor/pg_clipfrac': 0.0024038462433964014, 'actor/ppo_kl': 0.00025985180400311947}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3315483033657074, 'actor/pg_clipfrac': 0.0030303029343485832, 'actor/ppo_kl': -0.00019081866776105016}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.08170720189809799, 'actor/pg_clipfrac': 0.001291989698074758, 'actor/ppo_kl': -0.0008078902610577643}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.015387874096632004, 'actor/pg_clipfrac': 0.0017182130832225084, 'actor/ppo_kl': 0.0018310678424313664}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2432735562324524, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002971974608954042}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.11379750818014145, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010985019616782665}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.22383686900138855, 'actor/pg_clipfrac': 0.0015174506697803736, 'actor/ppo_kl': 0.0014143033185973763}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.39267903566360474, 'actor/pg_clipfrac': 0.0033975085243582726, 'actor/ppo_kl': 0.000513420207425952}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.36957719922065735, 'actor/pg_clipfrac': 0.00200803205370903, 'actor/ppo_kl': 0.0014598437119275331}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.47607386112213135, 'actor/pg_clipfrac': 0.002657218836247921, 'actor/ppo_kl': -0.002825058763846755}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.26713064312934875, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00030921580037102103}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.49787047505378723, 'actor/pg_clipfrac': 0.0015772870974615216, 'actor/ppo_kl': 0.00027608571690507233}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3231760263442993, 'actor/pg_clipfrac': 0.0012690355069935322, 'actor/ppo_kl': 0.001396268722601235}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.19413742423057556, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015839929692447186}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00017968041356652975, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018027350306510925}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00018737079517450184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013345991028472781}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -1.0810790061950684, 'actor/pg_clipfrac': 0.0028544242959469557, 'actor/ppo_kl': -0.00022658248781226575}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00026819074992090464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0021415490191429853}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.10349580645561218, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002650693291798234}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3557880222797394, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037830753717571497}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.19657756388187408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004693031369242817}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2171778678894043, 'actor/pg_clipfrac': 0.0020790020935237408, 'actor/ppo_kl': -0.0006965483771637082}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5308967232704163, 'actor/pg_clipfrac': 0.0021645021624863148, 'actor/ppo_kl': 4.990482921130024e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.33252206444740295, 'actor/pg_clipfrac': 0.0011389522114768624, 'actor/ppo_kl': 0.0018689300632104278}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00020198291167616844, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007800071034580469}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001468958507757634, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010533559834584594}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.0206321869045496, 'actor/pg_clipfrac': 0.0007309941574931145, 'actor/ppo_kl': 0.0009123512427322567}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.27442070841789246, 'actor/pg_clipfrac': 0.0009425070602446795, 'actor/ppo_kl': 0.00011280681792413816}
[36m(Runner pid=3309020)[0m Step 18
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.277
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.004
[36m(Runner pid=3309020)[0m ppo_kl: 4.111909561781424e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:25:24, 4.02s/it, est. speed input: 114.21 toks/s, output: 23.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<46:55, 2.22s/it, est. speed input: 186.77 toks/s, output: 41.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:27<30:01, 1.42s/it, est. speed input: 257.17 toks/s, output: 59.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<18:27, 1.14it/s, est. speed input: 333.41 toks/s, output: 82.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<12:19, 1.70it/s, est. speed input: 412.19 toks/s, output: 104.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<06:59, 2.97it/s, est. speed input: 556.61 toks/s, output: 138.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:28<05:17, 3.91it/s, est. speed input: 625.96 toks/s, output: 158.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<05:16, 3.90it/s, est. speed input: 673.48 toks/s, output: 171.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:30<03:59, 5.13it/s, est. speed input: 744.66 toks/s, output: 194.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:30<03:01, 6.73it/s, est. speed input: 812.52 toks/s, output: 215.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:31<03:19, 6.10it/s, est. speed input: 859.60 toks/s, output: 231.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:31<02:37, 7.73it/s, est. speed input: 923.52 toks/s, output: 249.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<02:55, 6.90it/s, est. speed input: 968.36 toks/s, output: 267.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:32<02:13, 9.02it/s, est. speed input: 1034.47 toks/s, output: 285.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:33<01:54, 10.48it/s, est. speed input: 1090.79 toks/s, output: 301.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:21, 14.67it/s, est. speed input: 1212.43 toks/s, output: 338.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:33<01:09, 17.15it/s, est. speed input: 1278.42 toks/s, output: 359.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:33<00:59, 19.82it/s, est. speed input: 1335.01 toks/s, output: 377.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:34<01:20, 14.68it/s, est. speed input: 1377.57 toks/s, output: 394.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<01:06, 17.55it/s, est. speed input: 1439.78 toks/s, output: 418.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<00:43, 26.37it/s, est. speed input: 1570.00 toks/s, output: 463.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:34<00:32, 34.95it/s, est. speed input: 1702.95 toks/s, output: 511.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<00:29, 38.95it/s, est. speed input: 1825.48 toks/s, output: 547.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<00:26, 41.87it/s, est. speed input: 1949.15 toks/s, output: 588.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:35<00:22, 48.78it/s, est. speed input: 2070.19 toks/s, output: 630.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:35<00:22, 49.13it/s, est. speed input: 2183.06 toks/s, output: 676.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:35<00:22, 49.40it/s, est. speed input: 2297.61 toks/s, output: 726.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:36<00:26, 41.50it/s, est. speed input: 2400.31 toks/s, output: 762.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:22, 48.50it/s, est. speed input: 2518.01 toks/s, output: 805.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:36<00:32, 32.48it/s, est. speed input: 2604.17 toks/s, output: 836.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:36<00:26, 39.85it/s, est. speed input: 2720.40 toks/s, output: 881.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:37<00:22, 47.35it/s, est. speed input: 2828.77 toks/s, output: 923.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:37<00:24, 42.55it/s, est. speed input: 2933.82 toks/s, output: 960.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:37<00:20, 50.16it/s, est. speed input: 3045.89 toks/s, output: 1010.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:37<00:24, 41.30it/s, est. speed input: 3144.40 toks/s, output: 1055.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:37<00:22, 45.56it/s, est. speed input: 3247.77 toks/s, output: 1099.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:38<00:21, 45.49it/s, est. speed input: 3352.33 toks/s, output: 1148.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:38<00:17, 57.15it/s, est. speed input: 3520.20 toks/s, output: 1217.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:38<00:14, 66.84it/s, est. speed input: 3679.44 toks/s, output: 1281.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:38<00:14, 66.34it/s, est. speed input: 3781.69 toks/s, output: 1315.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:39<00:18, 51.88it/s, est. speed input: 3922.76 toks/s, output: 1369.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:39<00:19, 48.34it/s, est. speed input: 4018.86 toks/s, output: 1403.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:39<00:18, 49.08it/s, est. speed input: 4116.20 toks/s, output: 1439.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:39<00:15, 60.55it/s, est. speed input: 4281.33 toks/s, output: 1513.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:39<00:17, 51.32it/s, est. speed input: 4411.36 toks/s, output: 1571.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:40<00:16, 52.64it/s, est. speed input: 4504.82 toks/s, output: 1621.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:40<00:13, 64.74it/s, est. speed input: 4665.98 toks/s, output: 1691.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:40<00:13, 62.64it/s, est. speed input: 4803.56 toks/s, output: 1750.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:40<00:11, 73.76it/s, est. speed input: 4956.15 toks/s, output: 1829.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:40<00:11, 75.14it/s, est. speed input: 5055.07 toks/s, output: 1880.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:41<00:09, 85.79it/s, est. speed input: 5296.82 toks/s, output: 2000.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:41<00:09, 80.40it/s, est. speed input: 5489.90 toks/s, output: 2115.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:41<00:09, 81.11it/s, est. speed input: 5582.02 toks/s, output: 2175.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:41<00:09, 82.32it/s, est. speed input: 5677.27 toks/s, output: 2221.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:41<00:09, 83.74it/s, est. speed input: 5773.98 toks/s, output: 2263.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:41<00:10, 70.75it/s, est. speed input: 5852.60 toks/s, output: 2312.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:42<00:10, 71.54it/s, est. speed input: 5940.49 toks/s, output: 2361.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:42<00:09, 77.45it/s, est. speed input: 6034.75 toks/s, output: 2417.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:42<00:06, 109.31it/s, est. speed input: 6304.43 toks/s, output: 2553.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:42<00:07, 91.70it/s, est. speed input: 6428.59 toks/s, output: 2614.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:42<00:07, 83.80it/s, est. speed input: 6563.11 toks/s, output: 2675.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:42<00:06, 96.34it/s, est. speed input: 6752.92 toks/s, output: 2790.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:43<00:06, 101.08it/s, est. speed input: 6956.70 toks/s, output: 2895.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:43<00:05, 105.19it/s, est. speed input: 7165.61 toks/s, output: 2996.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:43<00:06, 95.17it/s, est. speed input: 7296.03 toks/s, output: 3062.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:43<00:07, 80.77it/s, est. speed input: 7452.88 toks/s, output: 3143.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:43<00:05, 101.12it/s, est. speed input: 7689.85 toks/s, output: 3274.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:44<00:04, 110.86it/s, est. speed input: 7927.38 toks/s, output: 3400.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:44<00:04, 115.98it/s, est. speed input: 8101.55 toks/s, output: 3506.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:44<00:04, 120.36it/s, est. speed input: 8241.63 toks/s, output: 3591.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:44<00:03, 120.59it/s, est. speed input: 8382.22 toks/s, output: 3690.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:44<00:04, 108.07it/s, est. speed input: 8501.11 toks/s, output: 3754.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:44<00:04, 109.44it/s, est. speed input: 8701.32 toks/s, output: 3891.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:44<00:03, 128.11it/s, est. speed input: 8943.48 toks/s, output: 4049.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:45<00:03, 127.54it/s, est. speed input: 9072.88 toks/s, output: 4139.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:45<00:03, 103.17it/s, est. speed input: 9183.42 toks/s, output: 4242.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:45<00:03, 98.40it/s, est. speed input: 9298.60 toks/s, output: 4330.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:45<00:03, 94.95it/s, est. speed input: 9420.87 toks/s, output: 4433.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:45<00:03, 99.81it/s, est. speed input: 9551.54 toks/s, output: 4510.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:45<00:03, 105.98it/s, est. speed input: 9710.96 toks/s, output: 4644.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:46<00:02, 134.11it/s, est. speed input: 9935.91 toks/s, output: 4815.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:46<00:02, 120.99it/s, est. speed input: 10056.28 toks/s, output: 4895.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:46<00:01, 136.24it/s, est. speed input: 10279.58 toks/s, output: 5053.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:46<00:01, 153.63it/s, est. speed input: 10496.60 toks/s, output: 5259.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:46<00:01, 155.87it/s, est. speed input: 10667.80 toks/s, output: 5414.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:46<00:01, 139.61it/s, est. speed input: 10833.34 toks/s, output: 5577.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:46<00:01, 159.27it/s, est. speed input: 11082.61 toks/s, output: 5754.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:47<00:01, 134.98it/s, est. speed input: 11231.96 toks/s, output: 5889.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:47<00:00, 153.26it/s, est. speed input: 11447.19 toks/s, output: 6062.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:47<00:00, 127.96it/s, est. speed input: 11586.08 toks/s, output: 6168.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:47<00:00, 100.08it/s, est. speed input: 11668.30 toks/s, output: 6244.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:47<00:00, 116.36it/s, est. speed input: 11838.42 toks/s, output: 6420.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:48<00:00, 72.26it/s, est. speed input: 11872.18 toks/s, output: 6471.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:48<00:00, 61.84it/s, est. speed input: 11921.20 toks/s, output: 6586.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:48<00:00, 54.61it/s, est. speed input: 11949.34 toks/s, output: 6650.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:49<00:00, 35.59it/s, est. speed input: 11887.92 toks/s, output: 6658.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 19.77it/s, est. speed input: 11690.42 toks/s, output: 6614.11 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:50<00:00, 25.23it/s, est. speed input: 11690.42 toks/s, output: 6614.11 toks/s]
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.632
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.632
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 916754
[36m(Runner pid=3309020)[0m balanced_min: 916753
[36m(Runner pid=3309020)[0m max: 921091
[36m(Runner pid=3309020)[0m mean: 916753.5
[36m(Runner pid=3309020)[0m min: 912416
[36m(Runner pid=3309020)[0m minmax_diff: 8675
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.484
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.113
[36m(Runner pid=3309020)[0m throughput: 1084.728
[36m(Runner pid=3309020)[0m time_per_step: 845.146
[36m(Runner pid=3309020)[0m total_num_tokens: 1833507
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 650.0
[36m(Runner pid=3309020)[0m mean: 466.049
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1333.0
[36m(Runner pid=3309020)[0m mean: 250.165
[36m(Runner pid=3309020)[0m min: 12.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.269
[36m(Runner pid=3309020)[0m format: 0.995
[36m(Runner pid=3309020)[0m overall: 0.632
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.23908144434708e-05
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.048
[36m(Runner pid=3309020)[0m ref: 0.049
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.307
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.151
[36m(Runner pid=3309020)[0m gen: 97.383
[36m(Runner pid=3309020)[0m old: 87.716
[36m(Runner pid=3309020)[0m ref: 90.445
[36m(Runner pid=3309020)[0m reward: 6.046
[36m(Runner pid=3309020)[0m step: 845.146
[36m(Runner pid=3309020)[0m update_actor: 562.782
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 19; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.70 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 01:58:59 [executor_base.py:219] It took 0.347295 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:00:22 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 01:58:59 [executor_base.py:219] It took 0.343846 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:00:22 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:00:22 [executor_base.py:208] It took 0.363707 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:00:22 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:00:22 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:00:22 [executor_base.py:208] It took 0.326923 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2638080418109894, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002037361409747973, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001760777784511447}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0358751155436039, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.13673534989356995, 'actor/pg_clipfrac': 0.0005534034571610391, 'actor/ppo_kl': 0.00040585361421108246}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00015826852177269757, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.13748131692409515, 'actor/pg_clipfrac': 0.0022831049282103777, 'actor/ppo_kl': -0.0029157656244933605}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.16446903347969055, 'actor/pg_clipfrac': 0.001958224456757307, 'actor/ppo_kl': -0.0007592529873363674}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00018542401085142046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.34454429149627686, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1833125203847885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006484233890660107}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.27011191844940186, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00012154076830483973, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015493915416300297}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.7169370055198669, 'actor/pg_clipfrac': 0.0011627906933426857, 'actor/ppo_kl': 0.00032743409974500537}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.39644983410835266, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.31221818923950195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010791630484163761}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.14656496047973633, 'actor/pg_clipfrac': 0.001408450654707849, 'actor/ppo_kl': 0.000563708832487464}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00024255602329503745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000150466468767263}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.692270815372467, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009519034065306187}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00014854801702313125, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005476537626236677}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.2649725377559662, 'actor/pg_clipfrac': 0.002844950184226036, 'actor/ppo_kl': 0.002170063555240631}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1157039999961853, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004442621429916471}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.14009462296962738, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -7.250195631058887e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.20736677944660187, 'actor/pg_clipfrac': 0.0012642225483432412, 'actor/ppo_kl': 0.0010253529762849212}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.6696863174438477, 'actor/pg_clipfrac': 0.00119331746827811, 'actor/ppo_kl': -1.1919791177206207e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.08212623745203018, 'actor/pg_clipfrac': 0.0027372261974960566, 'actor/ppo_kl': -0.0009639089112170041}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00026137850363738835, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005355099565349519}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00017561810091137886, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006114947609603405}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2514229714870453, 'actor/pg_clipfrac': 0.0012422360014170408, 'actor/ppo_kl': 0.00012895809777546674}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.38334745168685913, 'actor/pg_clipfrac': 0.0007710100035183132, 'actor/ppo_kl': 0.00017975547234527767}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.09516742080450058, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0030824511777609587}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.09239503741264343, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00043723281123675406}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.28457969427108765, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006872958620078862}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00020139564003329724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011532935313880444}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.44777712225914, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00021646714594680816}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.416220486164093, 'actor/pg_clipfrac': 0.005807200912386179, 'actor/ppo_kl': -0.001922018127515912}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.26984837651252747, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001718740095384419}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5874449014663696, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006471400847658515}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00020652114471886307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016207922017201781}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00021631852723658085, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00037047694786451757}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.031073041260242462, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004661312559619546}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.23745425045490265, 'actor/pg_clipfrac': 0.0006402048747986555, 'actor/ppo_kl': 0.001290815882384777}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002040718827629462, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003110281832050532}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.25948408246040344, 'actor/pg_clipfrac': 0.002352941082790494, 'actor/ppo_kl': 0.0015824426664039493}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.017030887305736542, 'actor/pg_clipfrac': 0.003142183879390359, 'actor/ppo_kl': 0.0022212460171431303}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.20002229511737823, 'actor/pg_clipfrac': 0.00147058826405555, 'actor/ppo_kl': 0.00016483980289194733}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.23954376578330994, 'actor/pg_clipfrac': 0.0007716049440205097, 'actor/ppo_kl': -0.000515375635586679}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2538567781448364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00048618190339766443}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00013502337969839573, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005143388407304883}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.09069934487342834, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002249122626380995}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.26978498697280884, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006747174193151295}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4544243812561035, 'actor/pg_clipfrac': 0.004615384619683027, 'actor/ppo_kl': -0.0018082897877320647}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1642119139432907, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00143020274117589}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002791321894619614, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004573822079692036}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00024557614233344793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016446431982330978}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00017596749239601195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010447336826473475}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1464429795742035, 'actor/pg_clipfrac': 0.0009090909152291715, 'actor/ppo_kl': -0.0012210308341309428}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0002261526824440807, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020379077177494764}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -6.158494215924293e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000704927952028811}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.30427634716033936, 'actor/pg_clipfrac': 0.001179245300590992, 'actor/ppo_kl': -0.00032514220220036805}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.10038279742002487, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005117745022289455}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.16342750191688538, 'actor/pg_clipfrac': 0.002358490601181984, 'actor/ppo_kl': -0.0014045238494873047}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.19186793267726898, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018600387556944042}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:18<1:19:32, 3.74s/it, est. speed input: 117.81 toks/s, output: 24.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:19<35:22, 1.67s/it, est. speed input: 223.48 toks/s, output: 39.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:22<23:53, 1.13s/it, est. speed input: 299.48 toks/s, output: 58.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:24<18:36, 1.13it/s, est. speed input: 365.62 toks/s, output: 75.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<15:51, 1.32it/s, est. speed input: 415.34 toks/s, output: 90.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<09:38, 2.15it/s, est. speed input: 546.56 toks/s, output: 125.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<08:42, 2.38it/s, est. speed input: 593.04 toks/s, output: 139.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<06:53, 2.99it/s, est. speed input: 655.66 toks/s, output: 155.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<05:22, 3.81it/s, est. speed input: 716.46 toks/s, output: 174.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<03:22, 6.03it/s, est. speed input: 847.01 toks/s, output: 210.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<02:54, 6.94it/s, est. speed input: 909.95 toks/s, output: 233.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<02:18, 8.71it/s, est. speed input: 971.97 toks/s, output: 251.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:32<01:27, 13.78it/s, est. speed input: 1103.89 toks/s, output: 295.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:07, 17.73it/s, est. speed input: 1233.39 toks/s, output: 330.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:33<00:49, 23.91it/s, est. speed input: 1365.78 toks/s, output: 371.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<01:09, 16.84it/s, est. speed input: 1467.12 toks/s, output: 403.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<00:57, 20.30it/s, est. speed input: 1592.51 toks/s, output: 451.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:34<00:51, 22.42it/s, est. speed input: 1710.11 toks/s, output: 490.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<00:44, 25.45it/s, est. speed input: 1829.20 toks/s, output: 533.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:35<00:42, 26.97it/s, est. speed input: 1882.13 toks/s, output: 547.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<00:45, 24.65it/s, est. speed input: 1930.57 toks/s, output: 561.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:35<00:42, 26.73it/s, est. speed input: 1988.93 toks/s, output: 584.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:35<00:24, 45.68it/s, est. speed input: 2231.26 toks/s, output: 672.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:36<00:23, 46.98it/s, est. speed input: 2350.82 toks/s, output: 709.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:36<00:28, 38.09it/s, est. speed input: 2448.69 toks/s, output: 750.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:30, 35.71it/s, est. speed input: 2500.53 toks/s, output: 774.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:36<00:24, 44.01it/s, est. speed input: 2619.13 toks/s, output: 812.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:37<00:26, 40.16it/s, est. speed input: 2722.25 toks/s, output: 841.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:37<00:39, 26.59it/s, est. speed input: 2751.63 toks/s, output: 859.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:29, 35.05it/s, est. speed input: 2865.19 toks/s, output: 904.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:37<00:23, 43.89it/s, est. speed input: 2972.19 toks/s, output: 944.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:38<00:24, 41.34it/s, est. speed input: 3074.45 toks/s, output: 990.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:38<00:26, 38.08it/s, est. speed input: 3169.91 toks/s, output: 1028.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:38<00:22, 43.73it/s, est. speed input: 3280.85 toks/s, output: 1078.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:38<00:18, 52.70it/s, est. speed input: 3394.26 toks/s, output: 1118.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:38<00:20, 47.94it/s, est. speed input: 3490.33 toks/s, output: 1144.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:38<00:17, 56.90it/s, est. speed input: 3600.13 toks/s, output: 1196.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:39<00:13, 71.51it/s, est. speed input: 3813.66 toks/s, output: 1288.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:39<00:11, 79.44it/s, est. speed input: 3992.39 toks/s, output: 1368.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:39<00:10, 87.29it/s, est. speed input: 4204.26 toks/s, output: 1457.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:39<00:16, 55.83it/s, est. speed input: 4278.38 toks/s, output: 1486.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:40<00:16, 53.33it/s, est. speed input: 4363.22 toks/s, output: 1536.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:40<00:15, 58.23it/s, est. speed input: 4509.94 toks/s, output: 1595.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:40<00:14, 61.87it/s, est. speed input: 4658.60 toks/s, output: 1660.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:40<00:12, 68.92it/s, est. speed input: 4808.78 toks/s, output: 1730.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:40<00:11, 71.75it/s, est. speed input: 4910.51 toks/s, output: 1787.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:41<00:12, 65.01it/s, est. speed input: 5002.22 toks/s, output: 1836.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:41<00:08, 96.18it/s, est. speed input: 5270.23 toks/s, output: 1965.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:41<00:10, 75.16it/s, est. speed input: 5403.30 toks/s, output: 2028.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:41<00:10, 77.26it/s, est. speed input: 5545.50 toks/s, output: 2099.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:41<00:08, 87.95it/s, est. speed input: 5747.59 toks/s, output: 2207.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:41<00:08, 90.01it/s, est. speed input: 5844.05 toks/s, output: 2251.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:42<00:09, 77.03it/s, est. speed input: 5928.99 toks/s, output: 2280.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:42<00:08, 86.27it/s, est. speed input: 6124.00 toks/s, output: 2386.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:42<00:08, 80.73it/s, est. speed input: 6263.76 toks/s, output: 2472.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:42<00:07, 89.95it/s, est. speed input: 6405.74 toks/s, output: 2555.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:42<00:07, 88.53it/s, est. speed input: 6492.60 toks/s, output: 2609.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:42<00:07, 92.67it/s, est. speed input: 6630.19 toks/s, output: 2667.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:42<00:04, 133.73it/s, est. speed input: 6930.93 toks/s, output: 2839.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:43<00:06, 103.27it/s, est. speed input: 7055.44 toks/s, output: 2919.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:43<00:05, 108.11it/s, est. speed input: 7202.80 toks/s, output: 2982.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:43<00:05, 98.76it/s, est. speed input: 7370.82 toks/s, output: 3086.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:43<00:04, 128.94it/s, est. speed input: 7685.93 toks/s, output: 3273.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:43<00:03, 150.76it/s, est. speed input: 7921.58 toks/s, output: 3431.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:43<00:03, 147.41it/s, est. speed input: 8108.00 toks/s, output: 3546.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:44<00:03, 135.76it/s, est. speed input: 8293.08 toks/s, output: 3678.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:44<00:03, 148.08it/s, est. speed input: 8524.35 toks/s, output: 3795.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:44<00:03, 147.13it/s, est. speed input: 8703.46 toks/s, output: 3905.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:44<00:03, 131.60it/s, est. speed input: 8876.14 toks/s, output: 4044.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:44<00:03, 123.17it/s, est. speed input: 9000.26 toks/s, output: 4118.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:44<00:03, 118.23it/s, est. speed input: 9127.87 toks/s, output: 4193.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:45<00:03, 105.23it/s, est. speed input: 9238.82 toks/s, output: 4277.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:45<00:03, 114.05it/s, est. speed input: 9370.05 toks/s, output: 4375.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:45<00:02, 119.31it/s, est. speed input: 9506.15 toks/s, output: 4443.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:45<00:01, 158.70it/s, est. speed input: 9869.69 toks/s, output: 4672.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:45<00:02, 127.55it/s, est. speed input: 10021.58 toks/s, output: 4769.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:45<00:01, 160.20it/s, est. speed input: 10311.27 toks/s, output: 4956.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:46<00:01, 130.85it/s, est. speed input: 10462.84 toks/s, output: 5094.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:46<00:01, 160.27it/s, est. speed input: 10735.56 toks/s, output: 5326.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:46<00:01, 122.28it/s, est. speed input: 10879.24 toks/s, output: 5450.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:46<00:01, 131.70it/s, est. speed input: 11135.09 toks/s, output: 5661.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:46<00:01, 109.12it/s, est. speed input: 11272.35 toks/s, output: 5783.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:47<00:01, 84.30it/s, est. speed input: 11344.92 toks/s, output: 5861.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:47<00:01, 91.86it/s, est. speed input: 11464.12 toks/s, output: 5961.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:47<00:01, 77.90it/s, est. speed input: 11541.93 toks/s, output: 6047.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:47<00:01, 78.06it/s, est. speed input: 11611.40 toks/s, output: 6135.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:47<00:01, 73.12it/s, est. speed input: 11663.88 toks/s, output: 6198.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:48<00:00, 73.92it/s, est. speed input: 11765.24 toks/s, output: 6299.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:48<00:00, 56.51it/s, est. speed input: 11809.23 toks/s, output: 6376.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:48<00:00, 48.63it/s, est. speed input: 11832.53 toks/s, output: 6405.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:48<00:00, 54.11it/s, est. speed input: 11897.65 toks/s, output: 6474.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:49<00:00, 40.07it/s, est. speed input: 11885.05 toks/s, output: 6522.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:52<00:00, 10.68it/s, est. speed input: 11342.08 toks/s, output: 6282.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 6.99it/s, est. speed input: 10969.36 toks/s, output: 6127.87 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.63it/s, est. speed input: 10969.36 toks/s, output: 6127.87 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.23720288276672363, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.8536912800045684e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5348291397094727, 'actor/pg_clipfrac': 0.0026200872380286455, 'actor/ppo_kl': 0.001534714363515377}
[36m(Runner pid=3309020)[0m Step 19
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.269
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.015
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.006
[36m(Runner pid=3309020)[0m ppo_kl: 4.784921063989422e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.008
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.008
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.628
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.628
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 919523
[36m(Runner pid=3309020)[0m balanced_min: 919522
[36m(Runner pid=3309020)[0m max: 928591
[36m(Runner pid=3309020)[0m mean: 919522.5
[36m(Runner pid=3309020)[0m min: 910454
[36m(Runner pid=3309020)[0m minmax_diff: 18137
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.304
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.113
[36m(Runner pid=3309020)[0m throughput: 1101.729
[36m(Runner pid=3309020)[0m time_per_step: 834.617
[36m(Runner pid=3309020)[0m total_num_tokens: 1839045
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 814.0
[36m(Runner pid=3309020)[0m mean: 463.443
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1133.0
[36m(Runner pid=3309020)[0m mean: 254.934
[36m(Runner pid=3309020)[0m min: 45.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.261
[36m(Runner pid=3309020)[0m format: 0.994
[36m(Runner pid=3309020)[0m overall: 0.628
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.149
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.304
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.198
[36m(Runner pid=3309020)[0m gen: 97.269
[36m(Runner pid=3309020)[0m old: 83.983
[36m(Runner pid=3309020)[0m ref: 86.144
[36m(Runner pid=3309020)[0m reward: 6.6
[36m(Runner pid=3309020)[0m step: 834.617
[36m(Runner pid=3309020)[0m update_actor: 559.809
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 20; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:12:55 [executor_base.py:219] It took 0.341095 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:14:19 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:12:55 [executor_base.py:219] It took 0.339587 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:14:19 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:14:19 [executor_base.py:208] It took 0.327421 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:14:21 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:14:21 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:14:21 [executor_base.py:208] It took 0.326677 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00011447452561696991, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00018195142911281437, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3445451259613037, 'actor/pg_clipfrac': 0.0011025358689948916, 'actor/ppo_kl': 0.001331030740402639}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.8776899576187134, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.923875621898333e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.309943825006485, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016341062728315592}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.015800699591636658, 'actor/pg_clipfrac': 0.0007385524222627282, 'actor/ppo_kl': -0.0006798402173444629}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.4177526533603668, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007291961228474975}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.16060961782932281, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.6825422644615173, 'actor/pg_clipfrac': 0.0012048193020746112, 'actor/ppo_kl': 0.00142263388261199}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002505655575077981, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.014209311455488205, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.13877329230308533, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001508814049884677}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3060944080352783, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005878580268472433}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00016653703642077744, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3024123013019562, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.37490883469581604, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00017931881302502006, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000366554653737694}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.26988276839256287, 'actor/pg_clipfrac': 0.0010152284521609545, 'actor/ppo_kl': 0.0008979022968560457}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002406665589660406, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007066594553180039}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00012792566849384457, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004025471571367234}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3875415027141571, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015262188389897346}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5920811295509338, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.7824640963226557e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.5239299535751343, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007523397216573358}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1640896201133728, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000616316101513803}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.4092523157596588, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012098338920623064}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.12027672678232193, 'actor/pg_clipfrac': 0.0012499999720603228, 'actor/ppo_kl': 0.003023662604391575}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.11371161043643951, 'actor/pg_clipfrac': 0.0036549707874655724, 'actor/ppo_kl': 2.4837359887897037e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.13344375789165497, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00040667993016541004}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.26036423444747925, 'actor/pg_clipfrac': 0.0015408321050927043, 'actor/ppo_kl': -0.00041427818359807134}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001379671593895182, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008538858965039253}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.4977426826953888, 'actor/pg_clipfrac': 0.0007087172125466168, 'actor/ppo_kl': -1.7298611055593938e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4020577073097229, 'actor/pg_clipfrac': 0.001917545567266643, 'actor/ppo_kl': 0.00022218508820515126}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.42280203104019165, 'actor/pg_clipfrac': 0.001303780940361321, 'actor/ppo_kl': 0.0003014381800312549}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.16526274383068085, 'actor/pg_clipfrac': 0.0030959751456975937, 'actor/ppo_kl': 0.0012153825955465436}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.30730146169662476, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001566654653288424}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.21908706426620483, 'actor/pg_clipfrac': 0.0024096386041492224, 'actor/ppo_kl': 0.001012258231639862}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.479281485080719, 'actor/pg_clipfrac': 0.0023781212512403727, 'actor/ppo_kl': 0.0011311416747048497}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2010129690170288, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007304063765332103}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.31290391087532043, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003845407336484641}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00015425161109305918, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006606018869206309}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.05813904106616974, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011874467600136995}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4678232967853546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004572384350467473}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00011715103028109297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.326183236320503e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1247251033782959, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022575785988010466}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5331845283508301, 'actor/pg_clipfrac': 0.0043196543119847775, 'actor/ppo_kl': -0.001088517252355814}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.9702250361442566, 'actor/pg_clipfrac': 0.0025359257124364376, 'actor/ppo_kl': -0.00019097993208561093}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001669228804530576, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004694333183579147}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1925346702337265, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018789728346746415}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.06295239180326462, 'actor/pg_clipfrac': 0.0012091898825019598, 'actor/ppo_kl': 0.0007902327342890203}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.6560025811195374, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001728500210447237}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.09260229021310806, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004298968706279993}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.12841103971004486, 'actor/pg_clipfrac': 0.0012135922443121672, 'actor/ppo_kl': 0.001604342949576676}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.28858616948127747, 'actor/pg_clipfrac': 0.0030257184989750385, 'actor/ppo_kl': 0.002652449533343315}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.02608283795416355, 'actor/pg_clipfrac': 0.0019011406693607569, 'actor/ppo_kl': -0.0018079199362546206}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.16009677946567535, 'actor/pg_clipfrac': 0.0062500000931322575, 'actor/ppo_kl': -0.001606027246452868}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1635267585515976, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.1393592255190015e-05}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:35:16, 15.20s/it, est. speed input: 29.14 toks/s, output: 5.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<39:44, 6.36s/it, est. speed input: 59.08 toks/s, output: 10.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 4/377 [00:15<15:00, 2.41s/it, est. speed input: 116.56 toks/s, output: 21.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 8/377 [00:15<05:40, 1.08it/s, est. speed input: 227.44 toks/s, output: 44.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:15<03:26, 1.77it/s, est. speed input: 314.17 toks/s, output: 62.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 16/377 [00:16<01:53, 3.17it/s, est. speed input: 449.36 toks/s, output: 93.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 19/377 [00:16<01:26, 4.15it/s, est. speed input: 527.65 toks/s, output: 113.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 24/377 [00:16<00:54, 6.47it/s, est. speed input: 659.81 toks/s, output: 147.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 30/377 [00:16<00:34, 10.07it/s, est. speed input: 819.58 toks/s, output: 190.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 35/377 [00:16<00:25, 13.37it/s, est. speed input: 950.56 toks/s, output: 226.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 42/377 [00:17<00:17, 19.13it/s, est. speed input: 1132.29 toks/s, output: 277.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 46/377 [00:17<00:15, 21.27it/s, est. speed input: 1231.94 toks/s, output: 307.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 52/377 [00:17<00:12, 26.31it/s, est. speed input: 1385.40 toks/s, output: 354.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 57/377 [00:17<00:12, 26.61it/s, est. speed input: 1505.13 toks/s, output: 391.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 61/377 [00:17<00:11, 27.95it/s, est. speed input: 1604.47 toks/s, output: 421.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 67/377 [00:17<00:09, 32.95it/s, est. speed input: 1750.77 toks/s, output: 470.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 71/377 [00:17<00:09, 33.05it/s, est. speed input: 1845.83 toks/s, output: 501.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 75/377 [00:17<00:09, 33.13it/s, est. speed input: 1936.01 toks/s, output: 533.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 79/377 [00:18<00:10, 27.79it/s, est. speed input: 2014.84 toks/s, output: 565.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 87/377 [00:18<00:07, 37.45it/s, est. speed input: 2204.12 toks/s, output: 636.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 93/377 [00:18<00:06, 41.10it/s, est. speed input: 2339.54 toks/s, output: 690.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▋ | 99/377 [00:18<00:06, 44.08it/s, est. speed input: 2476.90 toks/s, output: 746.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 104/377 [00:18<00:06, 44.19it/s, est. speed input: 2589.51 toks/s, output: 791.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 110/377 [00:18<00:05, 46.64it/s, est. speed input: 2724.44 toks/s, output: 845.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 115/377 [00:18<00:05, 46.16it/s, est. speed input: 2834.46 toks/s, output: 891.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 127/377 [00:18<00:03, 65.00it/s, est. speed input: 3114.40 toks/s, output: 1007.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 136/377 [00:19<00:03, 64.47it/s, est. speed input: 3320.00 toks/s, output: 1094.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 143/377 [00:19<00:03, 63.21it/s, est. speed input: 3467.74 toks/s, output: 1161.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 150/377 [00:19<00:03, 62.32it/s, est. speed input: 3616.18 toks/s, output: 1233.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 158/377 [00:19<00:03, 64.30it/s, est. speed input: 3783.75 toks/s, output: 1314.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 169/377 [00:19<00:02, 74.45it/s, est. speed input: 4024.61 toks/s, output: 1429.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 177/377 [00:19<00:02, 74.14it/s, est. speed input: 4195.45 toks/s, output: 1512.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 185/377 [00:19<00:03, 55.78it/s, est. speed input: 4335.04 toks/s, output: 1588.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 192/377 [00:19<00:03, 58.79it/s, est. speed input: 4480.01 toks/s, output: 1665.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 201/377 [00:20<00:02, 63.66it/s, est. speed input: 4664.01 toks/s, output: 1763.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 209/377 [00:20<00:02, 65.06it/s, est. speed input: 4825.99 toks/s, output: 1852.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 216/377 [00:20<00:02, 63.68it/s, est. speed input: 4959.89 toks/s, output: 1930.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 223/377 [00:20<00:02, 59.56it/s, est. speed input: 5090.21 toks/s, output: 2008.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 230/377 [00:20<00:02, 53.02it/s, est. speed input: 5210.00 toks/s, output: 2083.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 239/377 [00:20<00:02, 57.39it/s, est. speed input: 5386.27 toks/s, output: 2191.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:20<00:02, 59.11it/s, est. speed input: 5519.14 toks/s, output: 2276.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 253/377 [00:20<00:02, 58.79it/s, est. speed input: 5642.10 toks/s, output: 2361.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 262/377 [00:21<00:01, 65.18it/s, est. speed input: 5826.85 toks/s, output: 2478.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 269/377 [00:21<00:01, 63.11it/s, est. speed input: 5951.98 toks/s, output: 2566.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 276/377 [00:21<00:01, 58.80it/s, est. speed input: 6071.39 toks/s, output: 2653.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 282/377 [00:21<00:01, 55.89it/s, est. speed input: 6171.64 toks/s, output: 2730.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 291/377 [00:21<00:01, 63.41it/s, est. speed input: 6338.27 toks/s, output: 2856.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 299/377 [00:21<00:01, 62.73it/s, est. speed input: 6474.77 toks/s, output: 2964.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 306/377 [00:21<00:01, 48.22it/s, est. speed input: 6557.86 toks/s, output: 3049.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:22<00:01, 52.31it/s, est. speed input: 6695.38 toks/s, output: 3166.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 320/377 [00:22<00:01, 45.96it/s, est. speed input: 6770.32 toks/s, output: 3244.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 328/377 [00:22<00:00, 53.01it/s, est. speed input: 6911.71 toks/s, output: 3372.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 334/377 [00:22<00:00, 48.07it/s, est. speed input: 6989.34 toks/s, output: 3458.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 340/377 [00:22<00:00, 44.23it/s, est. speed input: 7067.77 toks/s, output: 3545.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:22<00:00, 43.68it/s, est. speed input: 7137.28 toks/s, output: 3623.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:23<00:00, 31.19it/s, est. speed input: 7148.66 toks/s, output: 3678.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:23<00:00, 29.80it/s, est. speed input: 7184.15 toks/s, output: 3737.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:23<00:00, 25.38it/s, est. speed input: 7195.69 toks/s, output: 3787.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:23<00:00, 17.63it/s, est. speed input: 7144.84 toks/s, output: 3797.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 365/377 [00:24<00:00, 18.67it/s, est. speed input: 7169.72 toks/s, output: 3863.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:24<00:00, 20.11it/s, est. speed input: 7197.13 toks/s, output: 3918.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:24<00:00, 21.03it/s, est. speed input: 7222.90 toks/s, output: 3973.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:25<00:00, 8.40it/s, est. speed input: 7009.50 toks/s, output: 3899.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 376/377 [00:25<00:00, 8.07it/s, est. speed input: 6967.57 toks/s, output: 3915.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 376/377 [00:36<00:00, 8.07it/s, est. speed input: 6967.57 toks/s, output: 3915.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:01<00:00, 4.65s/it, est. speed input: 2885.27 toks/s, output: 1705.93 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:01<00:00, 6.10it/s, est. speed input: 2885.27 toks/s, output: 1705.93 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.2646125257015228, 'actor/pg_clipfrac': 0.0012330455938354135, 'actor/ppo_kl': -0.0009053767425939441}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.02436361275613308, 'actor/pg_clipfrac': 0.0006807352183386683, 'actor/ppo_kl': -8.992259245133027e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.34298354387283325, 'actor/pg_clipfrac': 0.0012300122762098908, 'actor/ppo_kl': 0.0023324317298829556}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.12138634920120239, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00011389964492991567}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5115394592285156, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': -0.0005784180830232799}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00012025028263451532, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00012135210999986157}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3247763216495514, 'actor/pg_clipfrac': 0.003842459060251713, 'actor/ppo_kl': -0.0015923851169645786}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.43751248717308044, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012654063757508993}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:27:30 [executor_base.py:219] It took 0.345554 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:29:19 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:27:30 [executor_base.py:219] It took 0.341073 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:29:20 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:29:20 [executor_base.py:208] It took 0.325663 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:29:25 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:29:25 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:29:25 [executor_base.py:208] It took 0.327324 seconds to fall asleep.
[36m(Runner pid=3309020)[0m validation generation end
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since D is the midpoint of AB, AD = DB = 2.0. The line l is a vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC. To minimize this perimeter, we need to minimize the lengths of AE and EC.\n\nWhen E is on the line l, the shortest distance from E to A and C is the distance along the line perpendicular to AB. Since D is the midpoint of AB, the shortest distance from A to D is AD, and the shortest distance from C to D is DC. Therefore, the minimum perimeter of triangle AEC is the sum of AD, DC, and AC.\n\nSince D is the midpoint of AB, AD = DC = 2.0. Therefore, the minimum perimeter of triangle AEC is AD + DC + AC = 2.0 + 2.0 + AC = 4.0 + AC.\n\nGiven that the perimeter of triangle ABC is 16.0, we can find AC by subtracting the lengths of AB and BC from the perimeter. Since D is the midpoint of AB, AD = DB = 2.0, so AB = 4.0. Let BC = x. Then, AC = √(AB^2 + BC^2) = √(4.0^2 + x^2) = √(16 + x^2).\n\nThe perimeter of triangle ABC is AB + BC + AC = 4.0 + x + √(16 + x^2) = 16.0. Solving for x, we get x + √(16 + x^2) = 12.0.\n\nTo find the minimum perimeter of triangle AEC, we need to minimize AC = √(16 + x^2). Since x is a positive value (as it represents a length), the minimum value of AC occurs when x = 0. Therefore, AC = √(16 + 0^2) = √16 = 4.0.\n\nThus, the minimum perimeter of triangle AEC is 4.0 + AC = 4.0 + 4.0 = 8.0.\n\nTherefore, the minimum perimeter of triangle AEC is 8.0.\n\n\nThe minimum perimeter of triangle AEC is 8.0.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90 degrees to form triangle AB'C', we know that angle BAC = 90 degrees. When we rotate a triangle around one of its vertices by 90 degrees, the resulting triangle will have the same angles as the original triangle, but the orientation will be different.\n\nSince angle BAC is 90 degrees, and point B is rotated to become point B', the angle between CC' and BB' will be the same as the angle between AC and AB, which is 90 degrees.\n\nGiven that angle CC'B' = 32 degrees, we can deduce that angle AC'B' is the same as angle CC'B' because they are corresponding angles in the rotation. Therefore, angle AC'B' is also 32 degrees.\n\n\nThe size of angle AC'B' is 32 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\n1. Since A, B, and D are vertices of a parallelogram ABCD, the opposite angles of the parallelogram are equal. Therefore, angle ABC = angle ADC = 54 degrees.\n2. Since BE is the diameter of circle O, angle AEB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n3. In triangle ABE, angle AEB is 90 degrees because BE is the diameter.\n4. Since angle ABC = 54 degrees and angle AEB is 90 degrees, angle ABE = 90 - 54 = 36 degrees.\n5. Therefore, angle AEB = 90 degrees - angle ABE = 90 degrees - 36 degrees = 54 degrees.\n\n\nThe degree of angle AEB is 54 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O and CD is tangent to circle O at point D, we can use the properties of tangents and circles. The angle between the tangent and the chord through the point of contact is equal to the angle in the alternate segment. Therefore, angle A is equal to angle C because they are both subtended by the same arc AD.\n\nSince angle A is given as 35.0 degrees, angle C must also be 35.0 degrees.\n\n\n35.0 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo find the measure of angle ABC, we can use the properties of a circle and the given angles. Since AD is the diameter of circle O, angle ABD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. \n\nGiven that EA is tangent to the circle at A, angle EAD is also a right angle (90 degrees) because the angle between a tangent and a chord through the point of tangency is equal to the angle in the alternate segment. \n\nNow, we know that angle EAC is given as 120 degrees. Since angle EAD is 90 degrees, and angle EAC is 120 degrees, we can find angle DAC by subtracting the sum of angle EAD and angle EAC from 180 degrees (since they form a straight line):\n\nangle DAC = 180 - (angle EAD + angle EAC)\nangle DAC = 180 - (90 + 120)\nangle DAC = 180 - 210\nangle DAC = -30 degrees\n\nHowever, this calculation seems incorrect because angles in a circle cannot be negative. Let's correct the approach:\n\nSince angle EAD is 90 degrees and angle EAC is 120 degrees, we should be looking at the external angle at A, which is the sum of angle EAD and angle EAC minus the straight line angle (180 degrees). But since we are dealing with a circle, we should focus on the internal angles at B and C.\n\nSince AD is the diameter, angle ABD is 90 degrees. Now, we need to find angle ABC. We know that angle BAC is the same as angle EAC because they both intercept the same arc BC. So, angle BAC = 120 degrees.\n\nNow, in triangle ABC, we have:\nangle BAC = 120 degrees\nangle ABC + angle ACB = 180 degrees (since the sum of angles in a triangle is 180 degrees)\n\nWe also know that angle ACB = angle DAC because they are vertical angles. So, angle DAC = 180 - 90 - 120 = -30 degrees, which we corrected to be 30 degrees (since angles are always positive).\n\nSo, angle ABC + 30 = 180\nangle ABC = 180 - 30\nangle ABC = 150 degrees\n\nTherefore, the measure of angle ABC is 150 degrees.\n\n\n150 degrees\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_5
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 20
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.283
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.015
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 2.5334670723964337e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.629
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.629
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 926228
[36m(Runner pid=3309020)[0m balanced_min: 926227
[36m(Runner pid=3309020)[0m max: 926228
[36m(Runner pid=3309020)[0m mean: 926227.5
[36m(Runner pid=3309020)[0m min: 926227
[36m(Runner pid=3309020)[0m minmax_diff: 1
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.415
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.114
[36m(Runner pid=3309020)[0m throughput: 886.518
[36m(Runner pid=3309020)[0m time_per_step: 1044.793
[36m(Runner pid=3309020)[0m total_num_tokens: 1852455
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:32:18, 4.34s/it, est. speed input: 106.81 toks/s, output: 21.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<42:33, 2.01s/it, est. speed input: 193.57 toks/s, output: 42.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<23:35, 1.12s/it, est. speed input: 286.18 toks/s, output: 60.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<17:44, 1.18it/s, est. speed input: 348.63 toks/s, output: 77.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:17, 1.46it/s, est. speed input: 404.53 toks/s, output: 90.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<10:27, 1.99it/s, est. speed input: 469.73 toks/s, output: 103.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<07:35, 2.73it/s, est. speed input: 537.16 toks/s, output: 122.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<06:13, 3.32it/s, est. speed input: 598.83 toks/s, output: 137.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<04:45, 4.33it/s, est. speed input: 667.98 toks/s, output: 155.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:30<03:49, 5.37it/s, est. speed input: 737.67 toks/s, output: 178.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<02:53, 7.02it/s, est. speed input: 916.58 toks/s, output: 222.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<02:31, 7.97it/s, est. speed input: 978.30 toks/s, output: 243.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<02:30, 7.99it/s, est. speed input: 1028.55 toks/s, output: 261.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:33<02:01, 9.85it/s, est. speed input: 1092.13 toks/s, output: 277.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:33<01:53, 10.56it/s, est. speed input: 1144.34 toks/s, output: 295.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:34<01:46, 11.18it/s, est. speed input: 1194.73 toks/s, output: 317.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:35<02:40, 7.39it/s, est. speed input: 1218.18 toks/s, output: 329.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<02:37, 7.47it/s, est. speed input: 1260.53 toks/s, output: 345.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<01:51, 10.47it/s, est. speed input: 1367.91 toks/s, output: 388.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<01:35, 12.16it/s, est. speed input: 1417.22 toks/s, output: 408.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<01:34, 12.21it/s, est. speed input: 1464.20 toks/s, output: 424.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<01:21, 14.16it/s, est. speed input: 1517.56 toks/s, output: 445.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<01:19, 14.41it/s, est. speed input: 1569.04 toks/s, output: 464.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:43, 26.16it/s, est. speed input: 1754.78 toks/s, output: 537.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:39, 28.34it/s, est. speed input: 1810.10 toks/s, output: 562.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:36, 30.61it/s, est. speed input: 1866.69 toks/s, output: 589.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:24, 44.61it/s, est. speed input: 2046.40 toks/s, output: 657.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:41, 26.75it/s, est. speed input: 2124.68 toks/s, output: 689.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:31, 34.18it/s, est. speed input: 2236.98 toks/s, output: 743.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:22, 48.24it/s, est. speed input: 2463.97 toks/s, output: 842.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:39<00:17, 61.54it/s, est. speed input: 2630.77 toks/s, output: 924.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:39<00:18, 56.94it/s, est. speed input: 2730.27 toks/s, output: 968.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:39<00:17, 58.53it/s, est. speed input: 2836.31 toks/s, output: 1012.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:18, 55.42it/s, est. speed input: 2932.66 toks/s, output: 1058.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:24, 40.94it/s, est. speed input: 3019.32 toks/s, output: 1100.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:40<00:15, 62.93it/s, est. speed input: 3239.16 toks/s, output: 1186.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:22, 43.72it/s, est. speed input: 3316.80 toks/s, output: 1231.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:22, 42.85it/s, est. speed input: 3405.66 toks/s, output: 1280.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:41<00:20, 47.58it/s, est. speed input: 3507.46 toks/s, output: 1319.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:41<00:17, 55.57it/s, est. speed input: 3612.87 toks/s, output: 1368.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:41<00:14, 67.03it/s, est. speed input: 3771.58 toks/s, output: 1434.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:41<00:12, 71.24it/s, est. speed input: 3914.25 toks/s, output: 1495.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:42<00:14, 62.60it/s, est. speed input: 4000.12 toks/s, output: 1539.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:14, 61.54it/s, est. speed input: 4088.20 toks/s, output: 1593.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:42<00:16, 53.39it/s, est. speed input: 4174.10 toks/s, output: 1635.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:42<00:18, 49.03it/s, est. speed input: 4260.70 toks/s, output: 1683.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:42<00:16, 52.13it/s, est. speed input: 4354.70 toks/s, output: 1733.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:15, 54.93it/s, est. speed input: 4441.40 toks/s, output: 1790.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:13, 61.31it/s, est. speed input: 4536.35 toks/s, output: 1847.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:43<00:10, 76.49it/s, est. speed input: 4681.69 toks/s, output: 1936.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:43<00:11, 73.24it/s, est. speed input: 4772.79 toks/s, output: 1980.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:43<00:09, 87.77it/s, est. speed input: 4922.27 toks/s, output: 2058.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:43<00:07, 101.58it/s, est. speed input: 5068.80 toks/s, output: 2132.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:43<00:08, 96.55it/s, est. speed input: 5207.27 toks/s, output: 2208.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:44<00:06, 120.05it/s, est. speed input: 5404.74 toks/s, output: 2327.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:44<00:07, 96.31it/s, est. speed input: 5535.88 toks/s, output: 2405.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:44<00:07, 95.53it/s, est. speed input: 5675.65 toks/s, output: 2496.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:44<00:07, 95.73it/s, est. speed input: 5816.31 toks/s, output: 2583.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:44<00:07, 96.18it/s, est. speed input: 5947.93 toks/s, output: 2670.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:44<00:06, 112.92it/s, est. speed input: 6140.56 toks/s, output: 2757.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:45<00:07, 88.50it/s, est. speed input: 6251.23 toks/s, output: 2828.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:45<00:08, 77.16it/s, est. speed input: 6366.36 toks/s, output: 2900.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:45<00:05, 119.44it/s, est. speed input: 6788.04 toks/s, output: 3150.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:45<00:04, 137.04it/s, est. speed input: 7025.10 toks/s, output: 3296.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:45<00:04, 127.86it/s, est. speed input: 7197.56 toks/s, output: 3386.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:46<00:04, 132.17it/s, est. speed input: 7379.30 toks/s, output: 3495.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:46<00:04, 127.14it/s, est. speed input: 7515.55 toks/s, output: 3586.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:46<00:03, 129.12it/s, est. speed input: 7651.36 toks/s, output: 3669.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:46<00:03, 136.52it/s, est. speed input: 7828.91 toks/s, output: 3798.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:46<00:03, 121.78it/s, est. speed input: 7955.78 toks/s, output: 3880.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:46<00:03, 150.72it/s, est. speed input: 8184.68 toks/s, output: 4022.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:47<00:04, 95.28it/s, est. speed input: 8313.27 toks/s, output: 4115.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:47<00:03, 110.77it/s, est. speed input: 8529.05 toks/s, output: 4265.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:47<00:03, 111.76it/s, est. speed input: 8655.78 toks/s, output: 4338.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:47<00:03, 97.62it/s, est. speed input: 8763.10 toks/s, output: 4407.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:47<00:03, 114.23it/s, est. speed input: 8976.49 toks/s, output: 4556.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:47<00:03, 104.27it/s, est. speed input: 9093.83 toks/s, output: 4641.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:48<00:02, 129.20it/s, est. speed input: 9307.61 toks/s, output: 4793.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:48<00:01, 147.11it/s, est. speed input: 9531.46 toks/s, output: 4967.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:48<00:01, 151.51it/s, est. speed input: 9699.90 toks/s, output: 5095.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:48<00:02, 98.43it/s, est. speed input: 9810.97 toks/s, output: 5185.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:48<00:02, 98.58it/s, est. speed input: 9923.80 toks/s, output: 5294.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:49<00:02, 90.46it/s, est. speed input: 10059.94 toks/s, output: 5399.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:49<00:02, 96.65it/s, est. speed input: 10217.21 toks/s, output: 5503.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:49<00:01, 104.66it/s, est. speed input: 10334.49 toks/s, output: 5590.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:49<00:01, 98.43it/s, est. speed input: 10438.61 toks/s, output: 5696.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:49<00:01, 94.94it/s, est. speed input: 10553.02 toks/s, output: 5793.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:49<00:01, 83.20it/s, est. speed input: 10642.59 toks/s, output: 5878.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:50<00:01, 68.84it/s, est. speed input: 10777.35 toks/s, output: 6035.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:50<00:01, 70.79it/s, est. speed input: 10903.99 toks/s, output: 6194.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:50<00:01, 73.12it/s, est. speed input: 10965.33 toks/s, output: 6268.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:50<00:01, 65.96it/s, est. speed input: 11024.10 toks/s, output: 6338.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:51<00:00, 61.66it/s, est. speed input: 11077.90 toks/s, output: 6424.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:00, 57.25it/s, est. speed input: 11123.95 toks/s, output: 6492.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 63.38it/s, est. speed input: 11186.09 toks/s, output: 6572.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:51<00:00, 40.72it/s, est. speed input: 11171.24 toks/s, output: 6601.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 39.57it/s, est. speed input: 11197.91 toks/s, output: 6644.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:52<00:00, 37.08it/s, est. speed input: 11200.34 toks/s, output: 6665.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:52<00:00, 27.77it/s, est. speed input: 11168.82 toks/s, output: 6706.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 16.25it/s, est. speed input: 11030.13 toks/s, output: 6654.43 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 23.78it/s, est. speed input: 11030.13 toks/s, output: 6654.43 toks/s]
[36m(Runner pid=3309020)[0m max: 709.0
[36m(Runner pid=3309020)[0m mean: 464.717
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1580.0
[36m(Runner pid=3309020)[0m mean: 258.898
[36m(Runner pid=3309020)[0m min: 10.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.261
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.629
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.303
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.196
[36m(Runner pid=3309020)[0m gen: 100.993
[36m(Runner pid=3309020)[0m old: 84.933
[36m(Runner pid=3309020)[0m ref: 86.469
[36m(Runner pid=3309020)[0m reward: 6.652
[36m(Runner pid=3309020)[0m save_checkpoint: 32.629
[36m(Runner pid=3309020)[0m step: 1044.793
[36m(Runner pid=3309020)[0m update_actor: 560.447
[36m(Runner pid=3309020)[0m validation: 171.705
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.318
[36m(Runner pid=3309020)[0m format_reward: 0.989
[36m(Runner pid=3309020)[0m overall_reward: 0.654
[36m(Runner pid=3309020)[0m reward_score: 0.654
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.992
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 21; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:30:23 [executor_base.py:219] It took 0.352866 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:31:47 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:30:23 [executor_base.py:219] It took 0.343301 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:31:47 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:31:47 [executor_base.py:208] It took 0.326931 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:31:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:31:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:31:49 [executor_base.py:208] It took 0.326820 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.11794805526733398, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2609775960445404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.20214703679084778, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.032259054481983185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4623108208179474, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008654643315821886}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.4391816556453705, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.08092682808637619, 'actor/pg_clipfrac': 0.0020855057518929243, 'actor/ppo_kl': 0.00036791342427022755}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.09936345368623734, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00022415939019992948, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000775768596213311}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.17805251479148865, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003623850643634796}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00011238871957175434, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00015396861999761313}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3893711566925049, 'actor/pg_clipfrac': 0.0059612519107759, 'actor/ppo_kl': -0.0001334035478066653}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.14117412269115448, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00010665306763257831, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.5002735257148743, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3702548146247864, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00043882866157218814}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.09383013844490051, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010922999354079366}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.07858575880527496, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012097186408936977}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3312404453754425, 'actor/pg_clipfrac': 0.0006361323175951838, 'actor/ppo_kl': -0.0008095469675026834}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00016800915182102472, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.7734691684599966e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3388203978538513, 'actor/pg_clipfrac': 0.0014556040987372398, 'actor/ppo_kl': 0.00018484068277757615}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.17437642812728882, 'actor/pg_clipfrac': 0.0018484288593754172, 'actor/ppo_kl': -0.0010377492289990187}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00017045345157384872, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013724961318075657}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.107327401638031, 'actor/pg_clipfrac': 0.0007429420365951955, 'actor/ppo_kl': -0.0005550257046706975}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2939690053462982, 'actor/pg_clipfrac': 0.002322880318388343, 'actor/ppo_kl': 0.0006194086745381355}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4269119203090668, 'actor/pg_clipfrac': 0.0011750881094485521, 'actor/ppo_kl': 0.0002780672220978886}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.07643923163414001, 'actor/pg_clipfrac': 0.0011961722047999501, 'actor/ppo_kl': -0.000244446360738948}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.5482642650604248, 'actor/pg_clipfrac': 0.0042553190141916275, 'actor/ppo_kl': -0.0008242602925747633}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.16594509780406952, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003364047151990235}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.251797616481781, 'actor/pg_clipfrac': 0.000871839583851397, 'actor/ppo_kl': 7.633219502167776e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00013094012683723122, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000370380119420588}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.48621174693107605, 'actor/pg_clipfrac': 0.0011547344038262963, 'actor/ppo_kl': 0.0016606509452685714}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00022979314962867647, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016038697212934494}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.11913333833217621, 'actor/pg_clipfrac': 0.0011325028026476502, 'actor/ppo_kl': 0.0013075274182483554}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.26349034905433655, 'actor/pg_clipfrac': 0.001805054140277207, 'actor/ppo_kl': -0.002348576206713915}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5560543537139893, 'actor/pg_clipfrac': 0.002051281975582242, 'actor/ppo_kl': -0.0004427102976478636}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00013952673180028796, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003786681918427348}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.04306656867265701, 'actor/pg_clipfrac': 0.002912621246650815, 'actor/ppo_kl': 0.0003182216896675527}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.5876293182373047, 'actor/pg_clipfrac': 0.0006920415326021612, 'actor/ppo_kl': -0.00019195715140085667}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00022051793348509818, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003886694903485477}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00017964189464692026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000135565540404059}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5791730880737305, 'actor/pg_clipfrac': 0.002281368710100651, 'actor/ppo_kl': 0.001108572818338871}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5483089089393616, 'actor/pg_clipfrac': 0.0013297871919348836, 'actor/ppo_kl': 0.000805829418823123}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0001502897503087297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.735009371070191e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003279221709817648, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014045514399185777}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00021791859762743115, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004797661677002907}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.45309746265411377, 'actor/pg_clipfrac': 0.002188183832913637, 'actor/ppo_kl': 0.0014120369451120496}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00017651279631536454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009477948187850416}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00015868053014855832, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007555763004347682}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.731454610824585, 'actor/pg_clipfrac': 0.0008944543660618365, 'actor/ppo_kl': -0.0006249194266274571}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.32724493741989136, 'actor/pg_clipfrac': 0.001054852269589901, 'actor/ppo_kl': -0.0014305355725809932}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00012113324191886932, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006649832939729095}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00024159935128409415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00228113099001348}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.44779348373413086, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001588611921761185}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1813131421804428, 'actor/pg_clipfrac': 0.0011415524641051888, 'actor/ppo_kl': 0.001853877562098205}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.12199889868497849, 'actor/pg_clipfrac': 0.0010427528759464622, 'actor/ppo_kl': 5.838794095325284e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.14756353199481964, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001283665420487523}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.1446278989315033, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012257815105840564}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.21157687902450562, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00037188632995821536}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.08773206174373627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00013544173270929605}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.2090013474225998, 'actor/pg_clipfrac': 0.0010672358330339193, 'actor/ppo_kl': -0.0012575940927490592}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.29681625962257385, 'actor/pg_clipfrac': 0.0008285004296340048, 'actor/ppo_kl': -0.00047186255687847733}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.6542560458183289, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000609366106800735}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11498378962278366, 'actor/pg_clipfrac': 0.00074074073927477, 'actor/ppo_kl': -0.0011960517149418592}
[36m(Runner pid=3309020)[0m Step 21
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:33:11, 4.39s/it, est. speed input: 106.72 toks/s, output: 24.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<41:41, 1.97s/it, est. speed input: 197.02 toks/s, output: 47.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<25:12, 1.20s/it, est. speed input: 275.55 toks/s, output: 66.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<16:51, 1.25it/s, est. speed input: 352.52 toks/s, output: 84.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<11:53, 1.76it/s, est. speed input: 425.06 toks/s, output: 106.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:27<09:04, 2.30it/s, est. speed input: 496.47 toks/s, output: 127.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:28<07:24, 2.80it/s, est. speed input: 572.06 toks/s, output: 141.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<06:16, 3.29it/s, est. speed input: 634.37 toks/s, output: 160.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<05:12, 3.95it/s, est. speed input: 690.36 toks/s, output: 176.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<05:26, 3.77it/s, est. speed input: 731.93 toks/s, output: 190.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:31<04:22, 4.67it/s, est. speed input: 793.26 toks/s, output: 205.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<03:42, 5.48it/s, est. speed input: 851.44 toks/s, output: 221.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<02:51, 7.08it/s, est. speed input: 911.12 toks/s, output: 244.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<01:48, 11.14it/s, est. speed input: 1043.99 toks/s, output: 286.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:33<01:18, 15.13it/s, est. speed input: 1176.41 toks/s, output: 331.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:17, 15.43it/s, est. speed input: 1237.05 toks/s, output: 353.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<02:05, 9.47it/s, est. speed input: 1262.48 toks/s, output: 361.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:34<01:20, 14.57it/s, est. speed input: 1390.59 toks/s, output: 402.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:35<01:21, 14.44it/s, est. speed input: 1443.08 toks/s, output: 419.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:35<01:11, 16.28it/s, est. speed input: 1548.37 toks/s, output: 457.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<01:16, 15.11it/s, est. speed input: 1595.46 toks/s, output: 476.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:36<00:55, 20.50it/s, est. speed input: 1711.77 toks/s, output: 520.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:46, 24.35it/s, est. speed input: 1873.03 toks/s, output: 574.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:48, 23.06it/s, est. speed input: 1919.40 toks/s, output: 590.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:42, 26.55it/s, est. speed input: 2024.45 toks/s, output: 625.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:37<00:39, 28.22it/s, est. speed input: 2081.30 toks/s, output: 649.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:38<00:46, 23.73it/s, est. speed input: 2168.16 toks/s, output: 680.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:47, 23.18it/s, est. speed input: 2219.53 toks/s, output: 705.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:38<00:46, 23.53it/s, est. speed input: 2308.95 toks/s, output: 752.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:39<00:44, 23.98it/s, est. speed input: 2402.09 toks/s, output: 794.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:39<00:35, 29.82it/s, est. speed input: 2505.92 toks/s, output: 843.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:51, 20.75it/s, est. speed input: 2523.10 toks/s, output: 853.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:36, 28.65it/s, est. speed input: 2628.65 toks/s, output: 902.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:40<00:40, 26.12it/s, est. speed input: 2668.16 toks/s, output: 923.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:30, 33.42it/s, est. speed input: 2768.65 toks/s, output: 953.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:43, 23.49it/s, est. speed input: 2798.13 toks/s, output: 976.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:40<00:31, 32.76it/s, est. speed input: 2898.64 toks/s, output: 1020.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:28, 34.87it/s, est. speed input: 2998.96 toks/s, output: 1066.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:17, 55.12it/s, est. speed input: 3210.76 toks/s, output: 1161.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:16, 60.87it/s, est. speed input: 3367.50 toks/s, output: 1236.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:41<00:17, 54.31it/s, est. speed input: 3457.65 toks/s, output: 1292.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:18, 50.76it/s, est. speed input: 3538.73 toks/s, output: 1341.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:16, 55.93it/s, est. speed input: 3686.37 toks/s, output: 1407.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:42<00:10, 85.26it/s, est. speed input: 3996.54 toks/s, output: 1545.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:42<00:12, 69.48it/s, est. speed input: 4087.07 toks/s, output: 1592.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:42<00:12, 71.41it/s, est. speed input: 4183.78 toks/s, output: 1635.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:42<00:07, 108.18it/s, est. speed input: 4494.30 toks/s, output: 1803.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:10, 76.95it/s, est. speed input: 4619.96 toks/s, output: 1876.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:43<00:13, 63.04it/s, est. speed input: 4739.39 toks/s, output: 1951.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:43<00:09, 82.86it/s, est. speed input: 4983.01 toks/s, output: 2091.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:43<00:09, 86.91it/s, est. speed input: 5116.94 toks/s, output: 2155.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:44<00:07, 96.92it/s, est. speed input: 5262.42 toks/s, output: 2234.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:44<00:09, 76.08it/s, est. speed input: 5379.18 toks/s, output: 2304.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:44<00:08, 87.99it/s, est. speed input: 5523.70 toks/s, output: 2377.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:44<00:10, 70.03it/s, est. speed input: 5638.69 toks/s, output: 2446.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:44<00:09, 75.88it/s, est. speed input: 5776.82 toks/s, output: 2520.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:45<00:06, 102.15it/s, est. speed input: 6019.01 toks/s, output: 2644.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:45<00:07, 92.50it/s, est. speed input: 6143.72 toks/s, output: 2723.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:45<00:06, 100.31it/s, est. speed input: 6280.34 toks/s, output: 2800.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:45<00:05, 107.65it/s, est. speed input: 6420.90 toks/s, output: 2873.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:45<00:04, 130.09it/s, est. speed input: 6657.56 toks/s, output: 3019.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:45<00:03, 155.82it/s, est. speed input: 6996.66 toks/s, output: 3209.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:46<00:04, 127.38it/s, est. speed input: 7159.85 toks/s, output: 3345.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:46<00:03, 147.73it/s, est. speed input: 7396.50 toks/s, output: 3503.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:46<00:03, 132.04it/s, est. speed input: 7563.00 toks/s, output: 3589.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:46<00:04, 106.48it/s, est. speed input: 7672.58 toks/s, output: 3680.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:46<00:03, 132.34it/s, est. speed input: 7906.22 toks/s, output: 3826.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:46<00:03, 130.00it/s, est. speed input: 8084.57 toks/s, output: 3936.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:47<00:03, 115.08it/s, est. speed input: 8211.09 toks/s, output: 4019.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:47<00:04, 89.98it/s, est. speed input: 8314.60 toks/s, output: 4109.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:47<00:04, 83.80it/s, est. speed input: 8423.96 toks/s, output: 4180.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:47<00:05, 69.11it/s, est. speed input: 8479.63 toks/s, output: 4225.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:47<00:04, 89.65it/s, est. speed input: 8721.61 toks/s, output: 4417.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:48<00:02, 113.62it/s, est. speed input: 9025.51 toks/s, output: 4653.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:48<00:03, 107.10it/s, est. speed input: 9142.96 toks/s, output: 4721.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:48<00:02, 126.98it/s, est. speed input: 9367.79 toks/s, output: 4915.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:48<00:02, 119.99it/s, est. speed input: 9488.49 toks/s, output: 5021.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:48<00:02, 112.03it/s, est. speed input: 9590.66 toks/s, output: 5115.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:48<00:02, 112.67it/s, est. speed input: 9713.93 toks/s, output: 5242.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:49<00:02, 111.56it/s, est. speed input: 9829.60 toks/s, output: 5325.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:49<00:01, 139.86it/s, est. speed input: 10045.16 toks/s, output: 5482.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:49<00:01, 124.04it/s, est. speed input: 10156.14 toks/s, output: 5581.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:49<00:01, 141.29it/s, est. speed input: 10415.44 toks/s, output: 5814.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:49<00:01, 138.77it/s, est. speed input: 10532.30 toks/s, output: 5923.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:49<00:01, 87.87it/s, est. speed input: 10599.95 toks/s, output: 6013.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:50<00:01, 102.23it/s, est. speed input: 10767.57 toks/s, output: 6151.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:50<00:01, 68.62it/s, est. speed input: 10822.36 toks/s, output: 6232.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:50<00:01, 56.10it/s, est. speed input: 10851.61 toks/s, output: 6294.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:50<00:01, 60.63it/s, est. speed input: 10919.37 toks/s, output: 6366.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:51<00:00, 75.99it/s, est. speed input: 11078.61 toks/s, output: 6540.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:00, 72.62it/s, est. speed input: 11164.41 toks/s, output: 6636.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 77.17it/s, est. speed input: 11229.63 toks/s, output: 6712.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:00, 36.71it/s, est. speed input: 11156.32 toks/s, output: 6689.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 41.00it/s, est. speed input: 11209.34 toks/s, output: 6767.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:52<00:00, 35.58it/s, est. speed input: 11220.44 toks/s, output: 6829.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 19.46it/s, est. speed input: 11074.18 toks/s, output: 6805.87 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 23.77it/s, est. speed input: 11074.18 toks/s, output: 6805.87 toks/s]
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.273
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.011
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.014
[36m(Runner pid=3309020)[0m ppo_kl: 7.83040957010428e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.632
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.632
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 948079
[36m(Runner pid=3309020)[0m balanced_min: 948079
[36m(Runner pid=3309020)[0m max: 951941
[36m(Runner pid=3309020)[0m mean: 948079.0
[36m(Runner pid=3309020)[0m min: 944217
[36m(Runner pid=3309020)[0m minmax_diff: 7724
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.936
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.117
[36m(Runner pid=3309020)[0m throughput: 1127.311
[36m(Runner pid=3309020)[0m time_per_step: 841.009
[36m(Runner pid=3309020)[0m total_num_tokens: 1896158
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 661.0
[36m(Runner pid=3309020)[0m mean: 465.592
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1197.0
[36m(Runner pid=3309020)[0m mean: 275.095
[36m(Runner pid=3309020)[0m min: 54.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.268
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.632
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 6.265524766269115e-05
[36m(Runner pid=3309020)[0m gen: 0.149
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.295
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.119
[36m(Runner pid=3309020)[0m gen: 104.863
[36m(Runner pid=3309020)[0m old: 84.397
[36m(Runner pid=3309020)[0m ref: 84.773
[36m(Runner pid=3309020)[0m reward: 7.304
[36m(Runner pid=3309020)[0m step: 841.009
[36m(Runner pid=3309020)[0m update_actor: 558.939
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 22; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:44:22 [executor_base.py:219] It took 0.339611 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:44:22 [executor_base.py:219] It took 0.339594 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:45:47 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:45:47 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:45:47 [executor_base.py:208] It took 0.326981 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:45:53 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:45:54 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:45:54 [executor_base.py:208] It took 0.327389 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3180846571922302, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.04482090473175049, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00018868425104301423, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.1200961247086525, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.6697736382484436, 'actor/pg_clipfrac': 0.001208459259942174, 'actor/ppo_kl': -0.00030378991505131125}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001485396787757054, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015996441652532667}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002549569180700928, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000986968050710857}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1856229454278946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.5278287529945374, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008021787507459521}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.44853484630584717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.05151556432247162, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.24152225255966187, 'actor/pg_clipfrac': 0.0011210762895643711, 'actor/ppo_kl': -0.0004466146638151258}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00012788025196641684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012264170218259096}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.02346484735608101, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.43344220519065857, 'actor/pg_clipfrac': 0.002751031657680869, 'actor/ppo_kl': -0.000511562975589186}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001846873783506453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0024732097517699003}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001849499240051955, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009660496725700796}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.7300638556480408, 'actor/pg_clipfrac': 0.0011709601385518909, 'actor/ppo_kl': 0.0005879480158910155}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.11900090426206589, 'actor/pg_clipfrac': 0.0010373444529250264, 'actor/ppo_kl': 5.726003109884914e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.4789908230304718, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016475365264341235}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.1024080365896225, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011037900112569332}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2855193614959717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001045788885676302}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.10584557801485062, 'actor/pg_clipfrac': 0.0006127451197244227, 'actor/ppo_kl': 0.0006354801589623094}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.532242476940155, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00014671946701128036}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.40962255001068115, 'actor/pg_clipfrac': 0.001282873679883778, 'actor/ppo_kl': 0.0004708961059805006}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.20306441187858582, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019867783703375608}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0001457131584174931, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001239488017745316}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.037245869636535645, 'actor/pg_clipfrac': 0.0011428571306169033, 'actor/ppo_kl': 0.0003062656905967742}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1609070599079132, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014064443530514836}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.31412777304649353, 'actor/pg_clipfrac': 0.003144653979688883, 'actor/ppo_kl': -0.0005019385716877878}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.23770378530025482, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016568518185522407}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.059862639755010605, 'actor/pg_clipfrac': 0.0025884383358061314, 'actor/ppo_kl': -0.0012108684750273824}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.24853748083114624, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005668511730618775}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.35656917095184326, 'actor/pg_clipfrac': 0.0021929824724793434, 'actor/ppo_kl': -0.0006816178211010993}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.11211712658405304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006620704662054777}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.39640870690345764, 'actor/pg_clipfrac': 0.001743679167702794, 'actor/ppo_kl': 0.00029953187913633883}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1977931559085846, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002965521707665175}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001309688959736377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00020286321523599327}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2024955153465271, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00024837616365402937}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.9724692702293396, 'actor/pg_clipfrac': 0.0022740194108337164, 'actor/ppo_kl': 0.0007870502304285765}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5191360116004944, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012160064652562141}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.12715697288513184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018614111468195915}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.6690107583999634, 'actor/pg_clipfrac': 0.0025862068869173527, 'actor/ppo_kl': -0.00031223296537064016}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.19079340994358063, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005735502927564085}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00014192140952218324, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010703880107030272}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.18472053110599518, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006678058416582644}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.9914534091949463, 'actor/pg_clipfrac': 0.0008291873964481056, 'actor/ppo_kl': 0.0012505801860243082}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.41670551896095276, 'actor/pg_clipfrac': 0.0017241379246115685, 'actor/ppo_kl': 0.0010443401988595724}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.10795973241329193, 'actor/pg_clipfrac': 0.0015197568573057652, 'actor/ppo_kl': -0.000974800088442862}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00020170738571323454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004499334027059376}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003083731862716377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014175421092659235}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00021229266712907702, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014554358494933695}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.06671316921710968, 'actor/pg_clipfrac': 0.0026881720405071974, 'actor/ppo_kl': 0.0006972948904149234}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -1.01303231716156, 'actor/pg_clipfrac': 0.0028680688701570034, 'actor/ppo_kl': -0.0010282195871695876}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3999955952167511, 'actor/pg_clipfrac': 0.0011111111380159855, 'actor/ppo_kl': 0.00010517331975279376}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0427069365978241, 'actor/pg_clipfrac': 0.0016764459433034062, 'actor/ppo_kl': 0.0001320786977885291}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.19568593800067902, 'actor/pg_clipfrac': 0.001320132054388523, 'actor/ppo_kl': 0.0002740866329986602}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.08398950845003128, 'actor/pg_clipfrac': 0.0012722646351903677, 'actor/ppo_kl': 0.0012577605666592717}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:34:23, 4.44s/it, est. speed input: 104.92 toks/s, output: 21.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<41:47, 1.97s/it, est. speed input: 199.81 toks/s, output: 45.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<22:56, 1.09s/it, est. speed input: 296.91 toks/s, output: 66.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<16:29, 1.27it/s, est. speed input: 369.81 toks/s, output: 85.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<12:42, 1.64it/s, est. speed input: 438.28 toks/s, output: 104.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:26<08:35, 2.42it/s, est. speed input: 521.78 toks/s, output: 130.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:27<06:09, 3.37it/s, est. speed input: 601.35 toks/s, output: 149.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:27<05:02, 4.10it/s, est. speed input: 665.44 toks/s, output: 167.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:29<05:20, 3.85it/s, est. speed input: 714.33 toks/s, output: 180.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:30<04:42, 4.35it/s, est. speed input: 768.62 toks/s, output: 194.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:30<04:00, 5.10it/s, est. speed input: 830.10 toks/s, output: 211.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:31<03:25, 5.93it/s, est. speed input: 888.92 toks/s, output: 229.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:31<03:02, 6.66it/s, est. speed input: 947.84 toks/s, output: 248.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:31<02:23, 8.43it/s, est. speed input: 1013.94 toks/s, output: 270.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:32<01:33, 12.88it/s, est. speed input: 1151.15 toks/s, output: 310.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:32<01:23, 14.37it/s, est. speed input: 1216.08 toks/s, output: 323.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:38, 12.09it/s, est. speed input: 1267.51 toks/s, output: 336.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:33<01:30, 13.10it/s, est. speed input: 1321.21 toks/s, output: 358.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:34<01:39, 11.86it/s, est. speed input: 1420.70 toks/s, output: 397.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<01:34, 12.37it/s, est. speed input: 1472.82 toks/s, output: 413.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<01:02, 18.62it/s, est. speed input: 1600.63 toks/s, output: 464.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:35<01:02, 18.57it/s, est. speed input: 1655.47 toks/s, output: 486.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<00:36, 31.34it/s, est. speed input: 1839.92 toks/s, output: 558.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<00:34, 32.99it/s, est. speed input: 1957.23 toks/s, output: 602.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:35<00:30, 36.90it/s, est. speed input: 2074.65 toks/s, output: 647.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:36<00:50, 21.90it/s, est. speed input: 2102.40 toks/s, output: 663.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:36<00:32, 34.29it/s, est. speed input: 2277.90 toks/s, output: 740.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:36<00:28, 38.55it/s, est. speed input: 2381.69 toks/s, output: 785.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:29, 37.20it/s, est. speed input: 2482.56 toks/s, output: 833.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:37<00:33, 32.40it/s, est. speed input: 2582.96 toks/s, output: 874.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:37<00:39, 26.83it/s, est. speed input: 2619.56 toks/s, output: 892.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:22, 46.75it/s, est. speed input: 2860.33 toks/s, output: 998.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:38<00:25, 41.04it/s, est. speed input: 2956.73 toks/s, output: 1034.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:38<00:22, 45.51it/s, est. speed input: 3066.68 toks/s, output: 1079.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:38<00:21, 46.22it/s, est. speed input: 3169.25 toks/s, output: 1127.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:38<00:18, 54.46it/s, est. speed input: 3282.41 toks/s, output: 1169.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:38<00:15, 62.54it/s, est. speed input: 3395.76 toks/s, output: 1223.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:39<00:21, 46.38it/s, est. speed input: 3479.05 toks/s, output: 1267.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:39<00:13, 73.70it/s, est. speed input: 3753.07 toks/s, output: 1384.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:39<00:14, 67.05it/s, est. speed input: 3848.60 toks/s, output: 1426.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:39<00:15, 62.65it/s, est. speed input: 3947.80 toks/s, output: 1474.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:39<00:12, 72.68it/s, est. speed input: 4109.68 toks/s, output: 1554.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:39<00:11, 78.03it/s, est. speed input: 4321.20 toks/s, output: 1662.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:40<00:09, 94.86it/s, est. speed input: 4527.73 toks/s, output: 1757.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:40<00:10, 80.94it/s, est. speed input: 4664.53 toks/s, output: 1816.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:40<00:12, 70.65it/s, est. speed input: 4755.63 toks/s, output: 1862.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:40<00:10, 80.79it/s, est. speed input: 4908.43 toks/s, output: 1916.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:40<00:10, 81.24it/s, est. speed input: 5105.63 toks/s, output: 1998.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:41<00:13, 60.85it/s, est. speed input: 5178.20 toks/s, output: 2033.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:41<00:16, 48.39it/s, est. speed input: 5251.87 toks/s, output: 2082.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:41<00:13, 59.67it/s, est. speed input: 5453.98 toks/s, output: 2193.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:41<00:10, 75.67it/s, est. speed input: 5659.42 toks/s, output: 2296.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:42<00:07, 98.43it/s, est. speed input: 5916.75 toks/s, output: 2427.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:42<00:07, 101.52it/s, est. speed input: 6067.37 toks/s, output: 2511.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:42<00:06, 116.23it/s, est. speed input: 6306.24 toks/s, output: 2649.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:42<00:06, 109.70it/s, est. speed input: 6439.71 toks/s, output: 2727.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:42<00:07, 95.10it/s, est. speed input: 6566.62 toks/s, output: 2804.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:42<00:07, 86.36it/s, est. speed input: 6690.90 toks/s, output: 2858.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:43<00:05, 122.55it/s, est. speed input: 7064.34 toks/s, output: 3055.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:43<00:05, 105.51it/s, est. speed input: 7194.71 toks/s, output: 3127.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:43<00:05, 100.72it/s, est. speed input: 7368.53 toks/s, output: 3213.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:43<00:05, 97.23it/s, est. speed input: 7499.15 toks/s, output: 3286.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:43<00:05, 98.92it/s, est. speed input: 7635.94 toks/s, output: 3370.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:44<00:05, 97.25it/s, est. speed input: 7811.90 toks/s, output: 3482.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:44<00:06, 82.02it/s, est. speed input: 7880.75 toks/s, output: 3520.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:44<00:06, 84.38it/s, est. speed input: 7967.25 toks/s, output: 3570.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:44<00:05, 86.89it/s, est. speed input: 8049.92 toks/s, output: 3632.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:44<00:06, 71.94it/s, est. speed input: 8116.58 toks/s, output: 3676.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:44<00:05, 83.69it/s, est. speed input: 8253.40 toks/s, output: 3770.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:44<00:03, 114.90it/s, est. speed input: 8489.36 toks/s, output: 3914.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:45<00:03, 114.20it/s, est. speed input: 8672.77 toks/s, output: 4017.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:45<00:03, 123.06it/s, est. speed input: 8856.27 toks/s, output: 4134.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:45<00:03, 117.24it/s, est. speed input: 9018.04 toks/s, output: 4228.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:45<00:02, 139.93it/s, est. speed input: 9277.16 toks/s, output: 4413.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:45<00:03, 110.12it/s, est. speed input: 9385.50 toks/s, output: 4493.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:45<00:02, 117.08it/s, est. speed input: 9529.88 toks/s, output: 4588.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:46<00:02, 109.06it/s, est. speed input: 9691.12 toks/s, output: 4731.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:46<00:02, 137.69it/s, est. speed input: 9955.99 toks/s, output: 4930.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:46<00:01, 140.32it/s, est. speed input: 10140.57 toks/s, output: 5067.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:46<00:01, 132.66it/s, est. speed input: 10264.75 toks/s, output: 5167.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:46<00:02, 109.91it/s, est. speed input: 10384.68 toks/s, output: 5271.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:46<00:02, 102.17it/s, est. speed input: 10492.97 toks/s, output: 5357.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:47<00:01, 119.48it/s, est. speed input: 10667.52 toks/s, output: 5503.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:47<00:01, 129.16it/s, est. speed input: 10841.80 toks/s, output: 5653.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:47<00:01, 104.17it/s, est. speed input: 11011.94 toks/s, output: 5789.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:47<00:01, 107.59it/s, est. speed input: 11120.61 toks/s, output: 5909.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:47<00:01, 71.14it/s, est. speed input: 11175.27 toks/s, output: 6000.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:48<00:01, 81.55it/s, est. speed input: 11285.08 toks/s, output: 6105.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:48<00:00, 96.11it/s, est. speed input: 11451.91 toks/s, output: 6241.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:48<00:00, 88.97it/s, est. speed input: 11554.01 toks/s, output: 6358.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:48<00:00, 97.03it/s, est. speed input: 11674.18 toks/s, output: 6497.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:48<00:00, 94.70it/s, est. speed input: 11782.48 toks/s, output: 6615.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:48<00:00, 83.72it/s, est. speed input: 11864.40 toks/s, output: 6704.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:49<00:00, 47.01it/s, est. speed input: 11822.63 toks/s, output: 6740.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:49<00:00, 40.20it/s, est. speed input: 11829.98 toks/s, output: 6797.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:51<00:00, 20.70it/s, est. speed input: 11649.34 toks/s, output: 6772.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 9.37it/s, est. speed input: 11219.58 toks/s, output: 6559.12 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 24.05it/s, est. speed input: 11219.58 toks/s, output: 6559.12 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.39684155583381653, 'actor/pg_clipfrac': 0.0031796502880752087, 'actor/ppo_kl': 0.0010154190240427852}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00013983964163344353, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003860684228129685}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1523931473493576, 'actor/pg_clipfrac': 0.0024479804560542107, 'actor/ppo_kl': 0.0008562127477489412}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.3060188889503479, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015965440543368459}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.10275707393884659, 'actor/pg_clipfrac': 0.006015037652105093, 'actor/ppo_kl': 0.0005422118701972067}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001879527699202299, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005319998599588871}
[36m(Runner pid=3309020)[0m Step 22
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.255
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: -3.393992614526553e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.0
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 962327
[36m(Runner pid=3309020)[0m balanced_min: 962107
[36m(Runner pid=3309020)[0m max: 962704
[36m(Runner pid=3309020)[0m mean: 962217.0
[36m(Runner pid=3309020)[0m min: 961730
[36m(Runner pid=3309020)[0m minmax_diff: 974
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.851
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.118
[36m(Runner pid=3309020)[0m throughput: 1130.445
[36m(Runner pid=3309020)[0m time_per_step: 851.184
[36m(Runner pid=3309020)[0m total_num_tokens: 1924434
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 668.0
[36m(Runner pid=3309020)[0m mean: 465.637
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1988.0
[36m(Runner pid=3309020)[0m mean: 286.095
[36m(Runner pid=3309020)[0m min: 53.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.266
[36m(Runner pid=3309020)[0m format: 0.993
[36m(Runner pid=3309020)[0m overall: 0.63
[36m(Runner pid=3309020)[0m tag_reward: 0.997
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.033289893159189e-05
[36m(Runner pid=3309020)[0m gen: 0.144
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.293
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.155
[36m(Runner pid=3309020)[0m gen: 105.271
[36m(Runner pid=3309020)[0m old: 84.417
[36m(Runner pid=3309020)[0m ref: 90.867
[36m(Runner pid=3309020)[0m reward: 6.901
[36m(Runner pid=3309020)[0m step: 851.184
[36m(Runner pid=3309020)[0m update_actor: 562.967
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 23; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 02:58:35 [executor_base.py:219] It took 0.339676 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:00:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 02:58:35 [executor_base.py:219] It took 0.339830 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:00:00 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:00:00 [executor_base.py:208] It took 0.327190 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:00:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:00:00 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:00:00 [executor_base.py:208] It took 0.326511 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00015621860802639276, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00031085603404790163, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0022177936043590307}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.20731130242347717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00024708607816137373, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.38343745470046997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.19819976389408112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00021713633032049984}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00017207270138897002, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012629827251657844}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.25251227617263794, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00014107968308962882, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005105208256281912}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002447575388941914, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.21366167068481445, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00023345067165791988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.29989901185035706, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.04449938237667084, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3386271893978119, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.31290486454963684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.41745659708976746, 'actor/pg_clipfrac': 0.0028571428265422583, 'actor/ppo_kl': -0.0015331649919971824}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.18423746526241302, 'actor/pg_clipfrac': 0.0022953327279537916, 'actor/ppo_kl': -0.0004410864203236997}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.37223386764526367, 'actor/pg_clipfrac': 0.000621890532784164, 'actor/ppo_kl': -0.0012522395700216293}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4831998348236084, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008700350299477577}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.17787443101406097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001059356378391385}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.7477173209190369, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005748315015807748}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.08461103588342667, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005367306293919683}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001342334144283086, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010972564341500401}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.10973085463047028, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00030313825118355453}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.08534185588359833, 'actor/pg_clipfrac': 0.0012210012646391988, 'actor/ppo_kl': -4.879691914538853e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.09590007364749908, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.4212646318774205e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00017474415653850883, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005697490414604545}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.4866936504840851, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0020966441370546818}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00015369401080533862, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009759148815646768}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00018525538325775415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009769626194611192}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00013057759497314692, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002855097991414368}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.457506000995636, 'actor/pg_clipfrac': 0.0007923930534161627, 'actor/ppo_kl': -0.0008225917117670178}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002706055238377303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0033487945329397917}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3584040403366089, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002072117058560252}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022602560056839138, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012568995589390397}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.9928563833236694, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002504419069737196}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.161159947514534, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00027706281980499625}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00012600918125826865, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005638535949401557}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1498778611421585, 'actor/pg_clipfrac': 0.0014409221475943923, 'actor/ppo_kl': -0.0027231250423938036}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.010947361588478088, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017671660752967}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002404390397714451, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011684789787977934}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.19854319095611572, 'actor/pg_clipfrac': 0.0009199631749652326, 'actor/ppo_kl': -0.00023130506451707333}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.06080592796206474, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010562896495684981}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.20279639959335327, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015848272014409304}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.21377205848693848, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014449270674958825}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.08529547601938248, 'actor/pg_clipfrac': 0.0008417508215643466, 'actor/ppo_kl': -0.0009482444729655981}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.40209582448005676, 'actor/pg_clipfrac': 0.0025929126422852278, 'actor/ppo_kl': 0.0006711180321872234}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4677596688270569, 'actor/pg_clipfrac': 0.005322687793523073, 'actor/ppo_kl': -0.0004983446560800076}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.5615394115447998, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006884301546961069}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.06361276656389236, 'actor/pg_clipfrac': 0.0008467400330118835, 'actor/ppo_kl': -8.402347157243639e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.1886124163866043, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00014724789070896804}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.22876523435115814, 'actor/pg_clipfrac': 0.0010204081190750003, 'actor/ppo_kl': 0.001022138400003314}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:14<1:03:22, 2.98s/it, est. speed input: 146.20 toks/s, output: 20.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<49:27, 2.34s/it, est. speed input: 203.60 toks/s, output: 35.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<30:25, 1.44s/it, est. speed input: 271.22 toks/s, output: 54.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<13:50, 1.51it/s, est. speed input: 438.70 toks/s, output: 96.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<11:16, 1.85it/s, est. speed input: 501.88 toks/s, output: 111.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<09:50, 2.11it/s, est. speed input: 552.62 toks/s, output: 128.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<08:16, 2.50it/s, est. speed input: 601.04 toks/s, output: 143.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<06:32, 3.15it/s, est. speed input: 657.24 toks/s, output: 161.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<04:51, 4.22it/s, est. speed input: 729.21 toks/s, output: 181.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:31<03:46, 5.41it/s, est. speed input: 789.91 toks/s, output: 203.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<01:58, 10.18it/s, est. speed input: 983.55 toks/s, output: 257.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:32<01:40, 11.93it/s, est. speed input: 1046.43 toks/s, output: 281.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:20, 14.86it/s, est. speed input: 1225.29 toks/s, output: 337.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:33<01:27, 13.47it/s, est. speed input: 1276.44 toks/s, output: 353.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<01:03, 18.48it/s, est. speed input: 1458.59 toks/s, output: 416.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:34<01:03, 18.37it/s, est. speed input: 1513.75 toks/s, output: 430.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:35<01:08, 16.97it/s, est. speed input: 1610.77 toks/s, output: 464.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<00:43, 26.17it/s, est. speed input: 1803.53 toks/s, output: 538.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<00:39, 28.68it/s, est. speed input: 1922.18 toks/s, output: 587.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:35<00:42, 26.41it/s, est. speed input: 1972.13 toks/s, output: 606.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:35<00:33, 33.55it/s, est. speed input: 2098.38 toks/s, output: 651.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:36<00:31, 34.80it/s, est. speed input: 2149.72 toks/s, output: 674.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:36<00:30, 35.98it/s, est. speed input: 2208.55 toks/s, output: 696.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:36<00:29, 37.22it/s, est. speed input: 2265.35 toks/s, output: 722.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:36<00:28, 38.28it/s, est. speed input: 2323.53 toks/s, output: 745.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:18, 59.84it/s, est. speed input: 2502.22 toks/s, output: 815.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:36<00:18, 58.80it/s, est. speed input: 2616.72 toks/s, output: 857.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:36<00:13, 75.66it/s, est. speed input: 2799.32 toks/s, output: 922.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:18, 56.72it/s, est. speed input: 2902.30 toks/s, output: 963.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:37<00:19, 52.44it/s, est. speed input: 3006.00 toks/s, output: 1006.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:37<00:20, 49.37it/s, est. speed input: 3166.15 toks/s, output: 1074.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:38<00:24, 40.44it/s, est. speed input: 3257.65 toks/s, output: 1104.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:38<00:18, 54.74it/s, est. speed input: 3429.30 toks/s, output: 1181.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:38<00:17, 57.07it/s, est. speed input: 3531.69 toks/s, output: 1230.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:38<00:15, 64.04it/s, est. speed input: 3644.45 toks/s, output: 1281.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:38<00:14, 67.38it/s, est. speed input: 3797.80 toks/s, output: 1348.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:38<00:14, 67.22it/s, est. speed input: 3900.10 toks/s, output: 1403.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:39<00:16, 57.18it/s, est. speed input: 3996.64 toks/s, output: 1446.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:39<00:09, 93.45it/s, est. speed input: 4332.75 toks/s, output: 1596.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:39<00:08, 105.17it/s, est. speed input: 4553.42 toks/s, output: 1690.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:39<00:11, 74.20it/s, est. speed input: 4689.06 toks/s, output: 1749.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:40<00:18, 47.62it/s, est. speed input: 4747.11 toks/s, output: 1782.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:40<00:18, 45.51it/s, est. speed input: 4833.97 toks/s, output: 1831.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:40<00:14, 56.91it/s, est. speed input: 4996.11 toks/s, output: 1907.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:40<00:12, 68.53it/s, est. speed input: 5147.75 toks/s, output: 1989.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:40<00:08, 96.01it/s, est. speed input: 5426.36 toks/s, output: 2115.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:40<00:07, 102.71it/s, est. speed input: 5570.12 toks/s, output: 2192.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:41<00:08, 87.48it/s, est. speed input: 5702.10 toks/s, output: 2262.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:41<00:07, 98.36it/s, est. speed input: 5904.89 toks/s, output: 2375.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:41<00:07, 92.11it/s, est. speed input: 6044.72 toks/s, output: 2443.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:41<00:05, 120.41it/s, est. speed input: 6304.00 toks/s, output: 2573.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:41<00:05, 125.72it/s, est. speed input: 6457.46 toks/s, output: 2653.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:41<00:05, 131.03it/s, est. speed input: 6606.03 toks/s, output: 2711.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:42<00:08, 78.05it/s, est. speed input: 6712.52 toks/s, output: 2749.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:42<00:06, 94.04it/s, est. speed input: 6916.98 toks/s, output: 2850.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:42<00:09, 69.65it/s, est. speed input: 7023.96 toks/s, output: 2916.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:43<00:10, 57.11it/s, est. speed input: 7087.78 toks/s, output: 2963.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:43<00:11, 53.28it/s, est. speed input: 7157.70 toks/s, output: 3011.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:43<00:06, 87.64it/s, est. speed input: 7460.32 toks/s, output: 3163.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:43<00:07, 71.48it/s, est. speed input: 7565.70 toks/s, output: 3234.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:43<00:06, 88.66it/s, est. speed input: 7754.60 toks/s, output: 3339.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:44<00:06, 78.36it/s, est. speed input: 7860.85 toks/s, output: 3405.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:44<00:05, 88.83it/s, est. speed input: 8002.77 toks/s, output: 3480.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:44<00:04, 116.89it/s, est. speed input: 8249.32 toks/s, output: 3625.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:44<00:03, 122.82it/s, est. speed input: 8380.82 toks/s, output: 3703.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:44<00:02, 151.13it/s, est. speed input: 8626.75 toks/s, output: 3843.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:44<00:03, 137.09it/s, est. speed input: 8805.91 toks/s, output: 3953.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:44<00:03, 119.47it/s, est. speed input: 8973.17 toks/s, output: 4062.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:45<00:04, 98.18it/s, est. speed input: 9077.58 toks/s, output: 4123.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:45<00:02, 136.63it/s, est. speed input: 9411.13 toks/s, output: 4322.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:45<00:03, 100.89it/s, est. speed input: 9539.70 toks/s, output: 4436.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:45<00:03, 98.19it/s, est. speed input: 9656.92 toks/s, output: 4535.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:45<00:03, 102.59it/s, est. speed input: 9774.48 toks/s, output: 4638.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:46<00:02, 110.53it/s, est. speed input: 9934.29 toks/s, output: 4758.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:46<00:01, 142.11it/s, est. speed input: 10227.87 toks/s, output: 4978.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:46<00:01, 130.08it/s, est. speed input: 10397.33 toks/s, output: 5133.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:46<00:01, 132.97it/s, est. speed input: 10522.68 toks/s, output: 5231.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:46<00:01, 168.89it/s, est. speed input: 10843.11 toks/s, output: 5468.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:46<00:01, 119.68it/s, est. speed input: 10997.57 toks/s, output: 5607.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:46<00:01, 119.59it/s, est. speed input: 11116.78 toks/s, output: 5716.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:47<00:01, 98.97it/s, est. speed input: 11199.77 toks/s, output: 5812.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:47<00:01, 96.41it/s, est. speed input: 11305.47 toks/s, output: 5914.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:47<00:01, 87.39it/s, est. speed input: 11479.06 toks/s, output: 6089.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:47<00:00, 93.05it/s, est. speed input: 11590.14 toks/s, output: 6206.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:47<00:00, 101.61it/s, est. speed input: 11717.63 toks/s, output: 6334.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:48<00:00, 74.31it/s, est. speed input: 11774.03 toks/s, output: 6409.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:48<00:00, 58.64it/s, est. speed input: 11798.87 toks/s, output: 6473.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:48<00:00, 60.33it/s, est. speed input: 11857.57 toks/s, output: 6564.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:49<00:00, 36.33it/s, est. speed input: 11798.81 toks/s, output: 6557.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:49<00:00, 32.55it/s, est. speed input: 11795.51 toks/s, output: 6598.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:51<00:00, 14.99it/s, est. speed input: 11510.24 toks/s, output: 6483.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:08<00:00, 14.99it/s, est. speed input: 11510.24 toks/s, output: 6483.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:12<00:00, 1.37it/s, est. speed input: 8219.67 toks/s, output: 4692.41 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:12<00:00, 17.71it/s, est. speed input: 8219.67 toks/s, output: 4692.41 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002895272627938539, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003440885338932276}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4349021315574646, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00031215036869980395}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3929098844528198, 'actor/pg_clipfrac': 0.0008673027041368186, 'actor/ppo_kl': -0.0018358586821705103}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00018448656192049384, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001009022380458191}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4322224259376526, 'actor/pg_clipfrac': 0.0011695906287059188, 'actor/ppo_kl': -0.0003940448514185846}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.004712417256087065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030182115733623505}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.485598623752594, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027291756123304367}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3143955171108246, 'actor/pg_clipfrac': 0.0017605633474886417, 'actor/ppo_kl': -0.0008228804799728096}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.30820828676223755, 'actor/pg_clipfrac': 0.0010131712770089507, 'actor/ppo_kl': 0.001077294466085732}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.08296129107475281, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006454529357142746}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.1374129354953766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00038243577000685036}
[36m(Runner pid=3309020)[0m Step 23
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.267
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.006
[36m(Runner pid=3309020)[0m ppo_kl: -7.626135279181767e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.626
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.626
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 954802
[36m(Runner pid=3309020)[0m balanced_min: 954802
[36m(Runner pid=3309020)[0m max: 963312
[36m(Runner pid=3309020)[0m mean: 954802.0
[36m(Runner pid=3309020)[0m min: 946292
[36m(Runner pid=3309020)[0m minmax_diff: 17020
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.755
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.117
[36m(Runner pid=3309020)[0m throughput: 1132.089
[36m(Runner pid=3309020)[0m time_per_step: 843.398
[36m(Runner pid=3309020)[0m total_num_tokens: 1909604
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 687.0
[36m(Runner pid=3309020)[0m mean: 467.561
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1232.0
[36m(Runner pid=3309020)[0m mean: 278.379
[36m(Runner pid=3309020)[0m min: 54.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.254
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.626
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.014038831966973e-05
[36m(Runner pid=3309020)[0m gen: 0.141
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.294
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.134
[36m(Runner pid=3309020)[0m gen: 100.645
[36m(Runner pid=3309020)[0m old: 86.58
[36m(Runner pid=3309020)[0m ref: 87.744
[36m(Runner pid=3309020)[0m reward: 5.69
[36m(Runner pid=3309020)[0m step: 843.398
[36m(Runner pid=3309020)[0m update_actor: 561.971
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 24; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:12:43 [executor_base.py:219] It took 0.340540 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:14:07 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:12:43 [executor_base.py:219] It took 0.339691 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:14:07 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:14:07 [executor_base.py:208] It took 0.326963 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:14:29 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:14:29 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:14:29 [executor_base.py:208] It took 0.326248 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5850660800933838, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001250488421646878}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2655371129512787, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08486057072877884, 'actor/pg_clipfrac': 0.0021344716660678387, 'actor/ppo_kl': -0.00023336827871389687}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4306638836860657, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.10073108226060867, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002835199993569404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015998691087588668}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.24104037880897522, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008508046739734709}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.34478551149368286, 'actor/pg_clipfrac': 0.0015243901871144772, 'actor/ppo_kl': 0.0016256396193057299}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.20909011363983154, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.475414901971817, 'actor/pg_clipfrac': 0.0008403361425735056, 'actor/ppo_kl': -0.0003082836337853223}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.33004528284072876, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.021127834916114807, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00021110783563926816}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.09459500759840012, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2425222396850586, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.139521986246109, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4770323932170868, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.21404021978378296, 'actor/pg_clipfrac': 0.0019907099194824696, 'actor/ppo_kl': -0.0012576072476804256}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2285672128200531, 'actor/pg_clipfrac': 0.0011013215407729149, 'actor/ppo_kl': 0.0005566748441196978}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00018019396520685405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010403679916635156}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.41491127014160156, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007501032087020576}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.20398938655853271, 'actor/pg_clipfrac': 0.0021459227427840233, 'actor/ppo_kl': -0.00034160574432462454}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.41322699189186096, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014183820458129048}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00020032026804983616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000656543648801744}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.06593558937311172, 'actor/pg_clipfrac': 0.0011799409985542297, 'actor/ppo_kl': -0.0002165667829103768}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00014970137272030115, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003351413761265576}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.12790101766586304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0024259022902697325}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.33082225918769836, 'actor/pg_clipfrac': 0.0018621974159032106, 'actor/ppo_kl': 0.0007877740426920354}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00010582091636024415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.235175821231678e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00024685077369213104, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00023016476188786328}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 7.681347778998315e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.4697899031452835e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.4475724995136261, 'actor/pg_clipfrac': 0.0017064845887944102, 'actor/ppo_kl': 0.00018960543093271554}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5196011066436768, 'actor/pg_clipfrac': 0.0012531328247860074, 'actor/ppo_kl': 0.0007299026474356651}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.6616190075874329, 'actor/pg_clipfrac': 0.0024721878580749035, 'actor/ppo_kl': -0.0004141822864767164}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2177765965461731, 'actor/pg_clipfrac': 0.001636661239899695, 'actor/ppo_kl': 0.0007258757250383496}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3588712215423584, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006217780755832791}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00015030622307676822, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006492156535387039}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4478874206542969, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007399879395961761}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2695246636867523, 'actor/pg_clipfrac': 0.0006770480540581048, 'actor/ppo_kl': -0.00021909199131187052}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0001951289304997772, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006205276004038751}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00019554788013920188, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003269268199801445}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.006276588886976242, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014431752497330308}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.591896116733551, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015251756412908435}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.03856110945343971, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00022423319751396775}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.28354597091674805, 'actor/pg_clipfrac': 0.0007980845985002816, 'actor/ppo_kl': -0.00018105503113474697}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.21695373952388763, 'actor/pg_clipfrac': 0.003445305861532688, 'actor/ppo_kl': -0.0011228122748434544}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.014478412456810474, 'actor/pg_clipfrac': 0.0010152284521609545, 'actor/ppo_kl': -0.0011759975459426641}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3762030601501465, 'actor/pg_clipfrac': 0.0010706637986004353, 'actor/ppo_kl': 0.0020626464392989874}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:23<1:39:20, 4.68s/it, est. speed input: 99.03 toks/s, output: 22.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<42:11, 1.99s/it, est. speed input: 188.66 toks/s, output: 43.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<24:26, 1.16s/it, est. speed input: 272.11 toks/s, output: 66.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<15:41, 1.34it/s, est. speed input: 356.88 toks/s, output: 88.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:26<11:31, 1.82it/s, est. speed input: 430.26 toks/s, output: 106.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<10:30, 1.98it/s, est. speed input: 482.63 toks/s, output: 121.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<07:50, 2.64it/s, est. speed input: 550.36 toks/s, output: 141.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<06:10, 3.34it/s, est. speed input: 615.68 toks/s, output: 157.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<04:43, 4.35it/s, est. speed input: 686.25 toks/s, output: 178.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:30<03:33, 5.75it/s, est. speed input: 755.75 toks/s, output: 196.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<03:32, 5.74it/s, est. speed input: 857.89 toks/s, output: 224.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:32<01:50, 10.87it/s, est. speed input: 1068.96 toks/s, output: 287.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:32<01:36, 12.49it/s, est. speed input: 1136.60 toks/s, output: 307.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:32<01:30, 13.23it/s, est. speed input: 1195.89 toks/s, output: 328.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:33, 12.70it/s, est. speed input: 1253.55 toks/s, output: 348.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:33<01:02, 18.89it/s, est. speed input: 1385.74 toks/s, output: 393.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:33<01:10, 16.69it/s, est. speed input: 1437.32 toks/s, output: 409.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:34<01:12, 16.04it/s, est. speed input: 1492.33 toks/s, output: 429.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<00:49, 23.52it/s, est. speed input: 1625.62 toks/s, output: 472.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:34<00:30, 37.04it/s, est. speed input: 1872.27 toks/s, output: 558.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:34<00:33, 34.22it/s, est. speed input: 1927.20 toks/s, output: 579.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:35<00:52, 21.58it/s, est. speed input: 1960.78 toks/s, output: 592.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:35<00:35, 31.67it/s, est. speed input: 2138.70 toks/s, output: 656.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:35<00:43, 25.55it/s, est. speed input: 2175.85 toks/s, output: 669.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:35<00:33, 33.18it/s, est. speed input: 2292.59 toks/s, output: 715.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:36<00:26, 41.18it/s, est. speed input: 2414.90 toks/s, output: 766.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:36<00:41, 25.84it/s, est. speed input: 2495.00 toks/s, output: 810.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:36<00:24, 43.30it/s, est. speed input: 2735.60 toks/s, output: 918.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:37<00:22, 46.33it/s, est. speed input: 2848.61 toks/s, output: 954.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:37<00:18, 55.74it/s, est. speed input: 3025.97 toks/s, output: 1019.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:37<00:14, 69.41it/s, est. speed input: 3200.67 toks/s, output: 1088.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:37<00:21, 46.67it/s, est. speed input: 3288.02 toks/s, output: 1123.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:37<00:13, 74.49it/s, est. speed input: 3590.54 toks/s, output: 1253.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:38<00:13, 73.99it/s, est. speed input: 3754.97 toks/s, output: 1325.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:38<00:12, 74.02it/s, est. speed input: 3911.61 toks/s, output: 1395.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:38<00:11, 81.53it/s, est. speed input: 4124.03 toks/s, output: 1495.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:38<00:11, 79.76it/s, est. speed input: 4281.52 toks/s, output: 1570.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:38<00:12, 71.37it/s, est. speed input: 4380.57 toks/s, output: 1620.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:39<00:12, 70.41it/s, est. speed input: 4582.13 toks/s, output: 1726.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:39<00:13, 64.01it/s, est. speed input: 4726.67 toks/s, output: 1791.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:39<00:13, 65.17it/s, est. speed input: 4831.96 toks/s, output: 1838.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:39<00:12, 68.99it/s, est. speed input: 5022.14 toks/s, output: 1937.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:39<00:11, 70.81it/s, est. speed input: 5123.68 toks/s, output: 1985.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:40<00:13, 63.45it/s, est. speed input: 5212.31 toks/s, output: 2029.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:40<00:12, 67.03it/s, est. speed input: 5313.75 toks/s, output: 2078.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:40<00:10, 79.97it/s, est. speed input: 5468.38 toks/s, output: 2143.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:40<00:09, 81.02it/s, est. speed input: 5562.99 toks/s, output: 2196.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:40<00:09, 85.35it/s, est. speed input: 5712.84 toks/s, output: 2280.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:40<00:09, 78.48it/s, est. speed input: 5827.48 toks/s, output: 2320.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:41<00:10, 69.39it/s, est. speed input: 5918.49 toks/s, output: 2367.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:41<00:09, 77.57it/s, est. speed input: 6065.55 toks/s, output: 2447.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:41<00:08, 81.22it/s, est. speed input: 6163.82 toks/s, output: 2496.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:41<00:05, 118.54it/s, est. speed input: 6418.88 toks/s, output: 2616.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:41<00:05, 115.99it/s, est. speed input: 6566.29 toks/s, output: 2696.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:41<00:06, 106.55it/s, est. speed input: 6713.77 toks/s, output: 2772.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:41<00:05, 119.53it/s, est. speed input: 6916.16 toks/s, output: 2888.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:42<00:05, 121.68it/s, est. speed input: 7106.39 toks/s, output: 2986.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:42<00:05, 121.54it/s, est. speed input: 7251.98 toks/s, output: 3054.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:42<00:06, 91.25it/s, est. speed input: 7369.32 toks/s, output: 3116.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:42<00:08, 69.58it/s, est. speed input: 7476.02 toks/s, output: 3184.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:42<00:05, 96.93it/s, est. speed input: 7732.96 toks/s, output: 3339.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:42<00:05, 102.62it/s, est. speed input: 7867.36 toks/s, output: 3412.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:43<00:05, 94.97it/s, est. speed input: 7994.26 toks/s, output: 3465.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:43<00:04, 125.57it/s, est. speed input: 8291.82 toks/s, output: 3634.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:43<00:04, 105.22it/s, est. speed input: 8409.53 toks/s, output: 3715.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:43<00:04, 113.51it/s, est. speed input: 8642.98 toks/s, output: 3876.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:43<00:04, 104.57it/s, est. speed input: 8766.64 toks/s, output: 3966.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:44<00:04, 105.62it/s, est. speed input: 8898.50 toks/s, output: 4065.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:44<00:04, 102.51it/s, est. speed input: 9022.08 toks/s, output: 4153.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:44<00:03, 103.90it/s, est. speed input: 9157.61 toks/s, output: 4254.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:44<00:03, 104.02it/s, est. speed input: 9328.47 toks/s, output: 4365.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:44<00:04, 89.66it/s, est. speed input: 9437.90 toks/s, output: 4437.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:44<00:04, 78.72it/s, est. speed input: 9501.32 toks/s, output: 4513.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:45<00:03, 103.93it/s, est. speed input: 9725.57 toks/s, output: 4667.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:45<00:02, 110.30it/s, est. speed input: 9852.84 toks/s, output: 4775.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:45<00:02, 116.50it/s, est. speed input: 9976.21 toks/s, output: 4863.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:45<00:02, 133.70it/s, est. speed input: 10161.60 toks/s, output: 4991.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:45<00:02, 95.42it/s, est. speed input: 10255.38 toks/s, output: 5088.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:45<00:02, 88.40it/s, est. speed input: 10359.68 toks/s, output: 5170.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:46<00:01, 123.28it/s, est. speed input: 10627.69 toks/s, output: 5395.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:46<00:01, 119.95it/s, est. speed input: 10831.58 toks/s, output: 5571.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:46<00:01, 101.69it/s, est. speed input: 11016.18 toks/s, output: 5710.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:46<00:01, 98.45it/s, est. speed input: 11123.78 toks/s, output: 5807.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:46<00:01, 83.28it/s, est. speed input: 11215.96 toks/s, output: 5922.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:47<00:01, 94.45it/s, est. speed input: 11378.81 toks/s, output: 6048.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:47<00:01, 89.64it/s, est. speed input: 11526.36 toks/s, output: 6184.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:47<00:00, 96.35it/s, est. speed input: 11656.02 toks/s, output: 6297.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:47<00:00, 108.48it/s, est. speed input: 11828.99 toks/s, output: 6489.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:47<00:00, 84.90it/s, est. speed input: 11896.51 toks/s, output: 6587.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:48<00:00, 61.36it/s, est. speed input: 11907.99 toks/s, output: 6619.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:48<00:00, 53.76it/s, est. speed input: 11934.90 toks/s, output: 6663.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:48<00:00, 42.63it/s, est. speed input: 11937.48 toks/s, output: 6714.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:49<00:00, 42.36it/s, est. speed input: 11973.96 toks/s, output: 6790.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:49<00:00, 24.01it/s, est. speed input: 11854.51 toks/s, output: 6749.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:50<00:00, 25.62it/s, est. speed input: 11866.87 toks/s, output: 6782.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:51<00:00, 11.85it/s, est. speed input: 11606.55 toks/s, output: 6657.63 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:51<00:00, 24.94it/s, est. speed input: 11606.55 toks/s, output: 6657.63 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.16273000836372375, 'actor/pg_clipfrac': 0.001088139251805842, 'actor/ppo_kl': -0.0008317907922901213}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00017414544709026814, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016150978626683354}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.49891987442970276, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007340691518038511}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.48207321763038635, 'actor/pg_clipfrac': 0.0009832842042669654, 'actor/ppo_kl': -0.001184185966849327}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.26990029215812683, 'actor/pg_clipfrac': 0.0015015015378594398, 'actor/ppo_kl': 0.0008245101780630648}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.04812624305486679, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009765496361069381}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.11661869287490845, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011416897177696228}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00027704742387868464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00106561288703233}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00015896704280748963, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005700521287508309}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001141674947575666, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012097080470994115}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.23513302206993103, 'actor/pg_clipfrac': 0.0005040322430431843, 'actor/ppo_kl': -0.0001116390194511041}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2576265335083008, 'actor/pg_clipfrac': 0.000834724516607821, 'actor/ppo_kl': 0.00016409088857471943}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2818357050418854, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003472477837931365}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.23172467947006226, 'actor/pg_clipfrac': 0.0024232633877545595, 'actor/ppo_kl': 0.0005306404200382531}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00018817813543137163, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00017185756587423384}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.09086042642593384, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002010389231145382}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003083542105741799, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00380434631370008}
[36m(Runner pid=3309020)[0m Step 24
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.305
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.018
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 2.276680499271322e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 948636
[36m(Runner pid=3309020)[0m balanced_min: 946478
[36m(Runner pid=3309020)[0m max: 961637
[36m(Runner pid=3309020)[0m mean: 947557.0
[36m(Runner pid=3309020)[0m min: 933477
[36m(Runner pid=3309020)[0m minmax_diff: 28160
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 102.619
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.116
[36m(Runner pid=3309020)[0m throughput: 1094.984
[36m(Runner pid=3309020)[0m time_per_step: 865.361
[36m(Runner pid=3309020)[0m total_num_tokens: 1895114
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 466.973
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3913.0
[36m(Runner pid=3309020)[0m mean: 273.306
[36m(Runner pid=3309020)[0m min: 54.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.263
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.63
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.664780270084748e-05
[36m(Runner pid=3309020)[0m gen: 0.175
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.297
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.164
[36m(Runner pid=3309020)[0m gen: 122.584
[36m(Runner pid=3309020)[0m old: 85.563
[36m(Runner pid=3309020)[0m ref: 86.835
[36m(Runner pid=3309020)[0m reward: 6.862
[36m(Runner pid=3309020)[0m step: 865.361
[36m(Runner pid=3309020)[0m update_actor: 562.726
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 25; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:27:11 [executor_base.py:219] It took 0.341058 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:28:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:27:11 [executor_base.py:219] It took 0.340895 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:28:36 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:28:36 [executor_base.py:208] It took 0.327786 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:28:37 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:28:37 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:28:37 [executor_base.py:208] It took 0.326044 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3795608878135681, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.07221776247024536, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.000195080297999084, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011080290423706174}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.29412582516670227, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4431972801685333, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0966937243938446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2718336880207062, 'actor/pg_clipfrac': 0.0014204545877873898, 'actor/ppo_kl': -0.0007509778952226043}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00010336290870327502, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.13615435361862183, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00017341130296699703, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3820400834083557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00015701918164268136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010511246509850025}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.24460755288600922, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00022105373500380665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.000261413719272241, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004869357799179852}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11574819684028625, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.014683417975902557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004921481595374644}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.32077813148498535, 'actor/pg_clipfrac': 0.0006915629492141306, 'actor/ppo_kl': 0.001007928280159831}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0629676803946495, 'actor/pg_clipfrac': 0.005454545374959707, 'actor/ppo_kl': -0.002140332944691181}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1936601996421814, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006698936922475696}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002392762980889529, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004938793135806918}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.12208018451929092, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010025579249486327}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00010147474677069113, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.386134919011965e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.07303816825151443, 'actor/pg_clipfrac': 0.0007961783558130264, 'actor/ppo_kl': -2.72647594101727e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00018345590797252953, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006732475594617426}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.03393939509987831, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012048714561387897}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.708824872970581, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001041412353515625}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00016985477122943848, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001202977611683309}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.08354239910840988, 'actor/pg_clipfrac': 0.0018744142726063728, 'actor/ppo_kl': -0.0019132011802867055}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.24696944653987885, 'actor/pg_clipfrac': 0.003225806402042508, 'actor/ppo_kl': 0.0009955159621313214}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.06627482175827026, 'actor/pg_clipfrac': 0.0010172940092161298, 'actor/ppo_kl': 0.0002243841445306316}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4750719666481018, 'actor/pg_clipfrac': 0.0020000000949949026, 'actor/ppo_kl': 0.00025019454187713563}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4040336012840271, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003381043206900358}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.04615955799818039, 'actor/pg_clipfrac': 0.0015910898800939322, 'actor/ppo_kl': -7.101952360244468e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3153645992279053, 'actor/pg_clipfrac': 0.0014771048445254564, 'actor/ppo_kl': -0.0015726526034995914}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00014691280375700444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010735385585576296}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.09523086994886398, 'actor/pg_clipfrac': 0.0005455537466332316, 'actor/ppo_kl': -0.0013944471720606089}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002121475845342502, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009585445513948798}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.02928067371249199, 'actor/pg_clipfrac': 0.001251564477570355, 'actor/ppo_kl': 0.0002391204034211114}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.18656040728092194, 'actor/pg_clipfrac': 0.0007867820677347481, 'actor/ppo_kl': 0.00034636311465874314}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.7050248980522156, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': 0.0008575067040510476}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021800491958856583, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000356873293640092}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:14<1:33:23, 14.90s/it, est. speed input: 30.66 toks/s, output: 4.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<39:12, 6.27s/it, est. speed input: 60.45 toks/s, output: 10.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<22:11, 3.56s/it, est. speed input: 89.42 toks/s, output: 15.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:15<09:48, 1.58s/it, est. speed input: 147.84 toks/s, output: 27.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:15<07:39, 1.24s/it, est. speed input: 174.75 toks/s, output: 33.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 8/377 [00:16<04:24, 1.39it/s, est. speed input: 230.66 toks/s, output: 45.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 10/377 [00:16<02:49, 2.17it/s, est. speed input: 285.53 toks/s, output: 59.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 13/377 [00:16<01:40, 3.63it/s, est. speed input: 366.25 toks/s, output: 78.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 15/377 [00:16<01:20, 4.51it/s, est. speed input: 417.78 toks/s, output: 91.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 22/377 [00:16<00:35, 10.07it/s, est. speed input: 605.08 toks/s, output: 142.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:16<00:27, 12.96it/s, est. speed input: 710.08 toks/s, output: 172.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 34/377 [00:17<00:16, 20.27it/s, est. speed input: 916.86 toks/s, output: 233.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 38/377 [00:17<00:15, 22.39it/s, est. speed input: 1016.82 toks/s, output: 264.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 43/377 [00:17<00:13, 24.72it/s, est. speed input: 1142.29 toks/s, output: 302.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 50/377 [00:17<00:10, 29.99it/s, est. speed input: 1316.90 toks/s, output: 358.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 55/377 [00:17<00:09, 32.37it/s, est. speed input: 1440.66 toks/s, output: 399.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 61/377 [00:17<00:08, 36.50it/s, est. speed input: 1590.08 toks/s, output: 447.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 68/377 [00:17<00:07, 42.04it/s, est. speed input: 1763.77 toks/s, output: 505.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 73/377 [00:17<00:07, 42.04it/s, est. speed input: 1880.20 toks/s, output: 548.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 80/377 [00:18<00:06, 46.63it/s, est. speed input: 2045.86 toks/s, output: 608.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 85/377 [00:18<00:06, 45.49it/s, est. speed input: 2159.91 toks/s, output: 650.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 91/377 [00:18<00:06, 47.25it/s, est. speed input: 2298.78 toks/s, output: 702.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 96/377 [00:18<00:07, 36.04it/s, est. speed input: 2394.60 toks/s, output: 742.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 103/377 [00:18<00:06, 42.10it/s, est. speed input: 2553.13 toks/s, output: 809.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 108/377 [00:18<00:07, 35.82it/s, est. speed input: 2649.98 toks/s, output: 851.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 113/377 [00:19<00:08, 30.67it/s, est. speed input: 2751.96 toks/s, output: 894.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 120/377 [00:19<00:06, 37.41it/s, est. speed input: 2905.41 toks/s, output: 965.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 127/377 [00:19<00:05, 44.30it/s, est. speed input: 3057.69 toks/s, output: 1036.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 138/377 [00:19<00:04, 57.12it/s, est. speed input: 3304.12 toks/s, output: 1149.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 147/377 [00:19<00:03, 62.56it/s, est. speed input: 3504.73 toks/s, output: 1242.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 154/377 [00:19<00:04, 53.25it/s, est. speed input: 3640.06 toks/s, output: 1312.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 161/377 [00:19<00:03, 55.27it/s, est. speed input: 3784.20 toks/s, output: 1386.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 175/377 [00:19<00:02, 70.25it/s, est. speed input: 4093.97 toks/s, output: 1542.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▊ | 183/377 [00:20<00:02, 71.83it/s, est. speed input: 4257.24 toks/s, output: 1629.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 191/377 [00:20<00:02, 67.00it/s, est. speed input: 4415.78 toks/s, output: 1715.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 201/377 [00:20<00:02, 68.87it/s, est. speed input: 4616.62 toks/s, output: 1828.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 211/377 [00:20<00:02, 73.59it/s, est. speed input: 4819.77 toks/s, output: 1945.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 219/377 [00:20<00:02, 54.44it/s, est. speed input: 4942.15 toks/s, output: 2025.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 226/377 [00:20<00:02, 54.03it/s, est. speed input: 5074.54 toks/s, output: 2107.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 235/377 [00:20<00:02, 60.00it/s, est. speed input: 5249.56 toks/s, output: 2219.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 244/377 [00:21<00:02, 65.34it/s, est. speed input: 5423.89 toks/s, output: 2332.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 252/377 [00:21<00:01, 63.27it/s, est. speed input: 5566.94 toks/s, output: 2428.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▊ | 259/377 [00:21<00:02, 55.16it/s, est. speed input: 5679.76 toks/s, output: 2509.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 268/377 [00:21<00:01, 60.20it/s, est. speed input: 5850.49 toks/s, output: 2629.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 275/377 [00:21<00:01, 57.09it/s, est. speed input: 5966.91 toks/s, output: 2718.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 281/377 [00:21<00:01, 56.60it/s, est. speed input: 6068.65 toks/s, output: 2798.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 290/377 [00:21<00:01, 61.74it/s, est. speed input: 6233.15 toks/s, output: 2925.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 299/377 [00:21<00:01, 68.09it/s, est. speed input: 6396.09 toks/s, output: 3055.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 306/377 [00:22<00:01, 60.14it/s, est. speed input: 6505.19 toks/s, output: 3152.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 315/377 [00:22<00:00, 65.79it/s, est. speed input: 6667.52 toks/s, output: 3287.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 322/377 [00:22<00:00, 65.93it/s, est. speed input: 6788.22 toks/s, output: 3391.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 329/377 [00:22<00:00, 63.59it/s, est. speed input: 6904.55 toks/s, output: 3496.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 336/377 [00:22<00:00, 51.81it/s, est. speed input: 6990.23 toks/s, output: 3592.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:22<00:00, 60.41it/s, est. speed input: 7146.12 toks/s, output: 3742.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 352/377 [00:22<00:00, 50.36it/s, est. speed input: 7230.89 toks/s, output: 3843.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:23<00:00, 32.48it/s, est. speed input: 7236.69 toks/s, output: 3899.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 363/377 [00:23<00:00, 29.52it/s, est. speed input: 7274.57 toks/s, output: 3970.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [00:23<00:00, 27.62it/s, est. speed input: 7301.75 toks/s, output: 4028.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:24<00:00, 11.88it/s, est. speed input: 7090.45 toks/s, output: 3965.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:38<00:00, 11.88it/s, est. speed input: 6968.17 toks/s, output: 3931.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:52<00:05, 1.91s/it, est. speed input: 3350.97 toks/s, output: 1964.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [00:57<00:04, 2.14s/it, est. speed input: 3093.65 toks/s, output: 1892.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 2.31s/it, est. speed input: 2817.06 toks/s, output: 1887.75 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.96it/s, est. speed input: 2817.06 toks/s, output: 1887.75 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.06535271555185318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00033140595769509673}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.14596283435821533, 'actor/pg_clipfrac': 0.0012642225483432412, 'actor/ppo_kl': -0.0008954893564805388}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.14104847609996796, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011890372261404991}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001277610135730356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.656307232333347e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00015256843471433967, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003310620959382504}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.603641927242279, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002205336233600974}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1785268634557724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00030687873368151486}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.313791424036026, 'actor/pg_clipfrac': 0.0008445946150459349, 'actor/ppo_kl': 0.001232791575603187}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.13630330562591553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00012384227011352777}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.054066065698862076, 'actor/pg_clipfrac': 0.0009259259095415473, 'actor/ppo_kl': -0.0009252601303160191}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.15079785883426666, 'actor/pg_clipfrac': 0.0005227391375228763, 'actor/ppo_kl': 0.0001234053634107113}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00022545339015778154, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005776068428531289}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002380597870796919, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006549076642841101}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.18868602812290192, 'actor/pg_clipfrac': 0.0009541984763927758, 'actor/ppo_kl': 0.000166605444974266}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001866240199888125, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001478008198319003}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.19703061878681183, 'actor/pg_clipfrac': 0.0012690355069935322, 'actor/ppo_kl': -0.0004489603452384472}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5398077964782715, 'actor/pg_clipfrac': 0.0010582010727375746, 'actor/ppo_kl': 0.0025261915288865566}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.10940119624137878, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006387294852174819}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.30192694067955017, 'actor/pg_clipfrac': 0.005576208233833313, 'actor/ppo_kl': -0.00015108824300114065}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.1774669587612152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004776650748681277}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00016357385902665555, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006630933494307101}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001738962746458128, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012729474110528827}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.68 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:41:52 [executor_base.py:219] It took 0.338983 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.59 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:41:52 [executor_base.py:219] It took 0.339402 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:43:40 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:43:41 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:43:41 [executor_base.py:208] It took 0.326187 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:43:41 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:43:41 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:43:41 [executor_base.py:208] It took 0.327329 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometry of the situation. Since D is the midpoint of AB, AD = BD = 2.0. The perimeter of triangle ABC is given as 16.0. Therefore, AB + AC + BC = 16.0. Since D is the midpoint, AD + BD = AB, so AB = AD + BD = 2 + 2 = 4.0. This means AC + BC = 16 - 4 = 12.0. \n\nThe line l is a vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is AC + EC + AE. Since AE + EC is the length of the line segment AC (because E is on l and l is vertical), the length of AC is minimized when E is at point D. This is because the shortest distance between two points is a straight line, and in this case, D is the point that minimizes the distance from D to AC.\n\nTherefore, the minimum perimeter of triangle AEC is AC, which is 12.0.\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven the problem, we have a right triangle \\(ABC\\) with \\(\\angle BAC = 90^\\circ\\). When triangle \\(ABC\\) is rotated clockwise around point \\(A\\) by \\(90^\\circ\\), it forms triangle \\(AB'C'C\\). We need to find the measure of \\(\\angle AC'B'\\) given that \\(\\angle CC'B' = 32^\\circ\\).\n\nSince the rotation is by \\(90^\\circ\\) and the triangle is rotated around \\(A\\), the angles at \\(B\\) and \\(C\\) in the original triangle \\(ABC\\) will correspond to the angles at \\(B'\\) and \\(C'\\) in the rotated triangle \\(AB'C'C\\). Therefore, \\(\\angle BAC = 90^\\circ\\) remains \\(90^\\circ\\) in the rotated triangle.\n\nIn the rotated triangle, \\(\\angle B'AC' = 90^\\circ\\) and \\(\\angle AB'C' = \\angle ABC\\) because the rotation preserves angles. The angle \\(\\angle CC'B'\\) is given as \\(32^\\circ\\), which is the same as \\(\\angle AC'B'\\) because the rotation preserves the angle measure.\n\nThus, \\(\\angle AC'B' = 32^\\circ\\).\n\n\n\n32.0\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are on circle O, and C is on the diameter BE, we can use properties of circles and tangents to solve this problem. Since A, B, and D are on the circle, angle ADC is an inscribed angle that subtends arc AB. The measure of an inscribed angle is half the measure of the central angle that subtends the same arc. Therefore, angle ADB (which is the same as angle ADC) is half the measure of the central angle subtended by arc AB. Given that angle ADC is 54 degrees, the central angle subtended by arc AB is 2 * 54 = 108 degrees.\n\nSince BE is a diameter, angle AEB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. This is due to the fact that angle AEB subtends the diameter BE, which is the longest chord in a circle, and thus the angle opposite the diameter is a right angle.\n\nTherefore, angle AEB = 90 degrees.\n\n\nThe degree of angle AEB is 90 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the angle C, we need to use the properties of tangents and the circle. Since CD is tangent to the circle at point D, angle ODC is 90 degrees (this is a well-known property of tangents to a circle). Given that angle A is 35 degrees, angle AOD is also 35 degrees because they are subtended by the same arc AD. Since angle ODC is 90 degrees, angle C can be found by subtracting the angle AOD from 90 degrees. Therefore, angle C = 90 - 35 = 55 degrees.\n\n\n55 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ACD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Since EA is a tangent to circle O, angle EAC is given as 120 degrees. The angle ACB is half of angle ACD because the angle subtended by a chord at the circumference is half the angle subtended at the center. Therefore, angle ACB = 45 degrees. Since angle ABC is part of the triangle ABC and we know angle ACB, we can find angle ABC by subtracting angle ACB from 180 degrees (since the sum of angles in a triangle is 180 degrees).\n\n\n30 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_10
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 25
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.259
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.012
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.019
[36m(Runner pid=3309020)[0m ppo_kl: 1.0363909764610213e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.03
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.03
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.633
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:33:07, 4.38s/it, est. speed input: 101.32 toks/s, output: 23.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:27<52:08, 2.46s/it, est. speed input: 162.30 toks/s, output: 39.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:27<28:37, 1.36s/it, est. speed input: 239.32 toks/s, output: 58.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<13:43, 1.52it/s, est. speed input: 386.96 toks/s, output: 98.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<12:27, 1.67it/s, est. speed input: 427.96 toks/s, output: 111.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<09:59, 2.08it/s, est. speed input: 487.74 toks/s, output: 129.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<07:28, 2.76it/s, est. speed input: 556.20 toks/s, output: 152.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<04:14, 4.82it/s, est. speed input: 690.01 toks/s, output: 190.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:32<03:20, 6.10it/s, est. speed input: 756.77 toks/s, output: 208.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<02:47, 7.28it/s, est. speed input: 821.54 toks/s, output: 228.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:33<01:43, 11.72it/s, est. speed input: 951.20 toks/s, output: 273.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:33<01:10, 17.01it/s, est. speed input: 1085.20 toks/s, output: 316.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:33<01:03, 18.95it/s, est. speed input: 1143.21 toks/s, output: 335.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:34<01:09, 17.23it/s, est. speed input: 1198.52 toks/s, output: 355.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<01:20, 14.67it/s, est. speed input: 1248.83 toks/s, output: 373.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:34<01:17, 15.19it/s, est. speed input: 1306.33 toks/s, output: 393.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<01:05, 17.87it/s, est. speed input: 1359.36 toks/s, output: 415.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:35<01:02, 18.56it/s, est. speed input: 1465.87 toks/s, output: 454.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:35<00:39, 28.91it/s, est. speed input: 1652.85 toks/s, output: 513.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:36<00:33, 33.41it/s, est. speed input: 1830.78 toks/s, output: 576.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:36<00:37, 30.26it/s, est. speed input: 1939.95 toks/s, output: 616.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:36<00:30, 36.41it/s, est. speed input: 2053.19 toks/s, output: 661.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:36<00:30, 36.55it/s, est. speed input: 2104.72 toks/s, output: 667.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:36<00:30, 36.59it/s, est. speed input: 2161.88 toks/s, output: 689.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:37<00:32, 33.87it/s, est. speed input: 2265.59 toks/s, output: 729.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:37<00:36, 29.64it/s, est. speed input: 2410.67 toks/s, output: 790.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:37<00:36, 29.32it/s, est. speed input: 2458.10 toks/s, output: 810.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:38<00:51, 20.59it/s, est. speed input: 2479.58 toks/s, output: 825.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:38<00:39, 26.75it/s, est. speed input: 2586.55 toks/s, output: 877.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:38<00:32, 32.65it/s, est. speed input: 2681.13 toks/s, output: 915.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:29, 35.52it/s, est. speed input: 2788.68 toks/s, output: 954.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:39<00:28, 36.84it/s, est. speed input: 2837.92 toks/s, output: 973.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:39<00:16, 62.35it/s, est. speed input: 3066.10 toks/s, output: 1059.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:39<00:16, 61.54it/s, est. speed input: 3170.98 toks/s, output: 1112.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:39<00:15, 63.41it/s, est. speed input: 3322.70 toks/s, output: 1182.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:40<00:17, 56.82it/s, est. speed input: 3474.22 toks/s, output: 1242.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:40<00:16, 58.70it/s, est. speed input: 3574.51 toks/s, output: 1289.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:40<00:14, 65.64it/s, est. speed input: 3685.23 toks/s, output: 1346.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:40<00:12, 74.86it/s, est. speed input: 3838.65 toks/s, output: 1426.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:40<00:13, 66.66it/s, est. speed input: 3937.51 toks/s, output: 1469.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:40<00:17, 52.86it/s, est. speed input: 4020.30 toks/s, output: 1503.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:41<00:18, 49.01it/s, est. speed input: 4109.94 toks/s, output: 1550.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:41<00:15, 56.90it/s, est. speed input: 4252.41 toks/s, output: 1628.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:41<00:12, 71.34it/s, est. speed input: 4461.71 toks/s, output: 1723.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:41<00:11, 72.51it/s, est. speed input: 4555.69 toks/s, output: 1775.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:41<00:13, 63.85it/s, est. speed input: 4643.14 toks/s, output: 1813.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:42<00:11, 75.91it/s, est. speed input: 4791.71 toks/s, output: 1874.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:42<00:09, 86.15it/s, est. speed input: 4947.85 toks/s, output: 1954.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:42<00:11, 68.14it/s, est. speed input: 5030.67 toks/s, output: 2009.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:42<00:09, 80.37it/s, est. speed input: 5188.64 toks/s, output: 2078.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:42<00:09, 78.68it/s, est. speed input: 5322.45 toks/s, output: 2134.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:42<00:09, 77.69it/s, est. speed input: 5458.24 toks/s, output: 2200.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:43<00:09, 79.31it/s, est. speed input: 5551.15 toks/s, output: 2255.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:43<00:09, 81.21it/s, est. speed input: 5646.15 toks/s, output: 2308.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:43<00:07, 93.63it/s, est. speed input: 5798.88 toks/s, output: 2389.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:43<00:09, 80.48it/s, est. speed input: 5890.80 toks/s, output: 2451.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:43<00:12, 58.44it/s, est. speed input: 5952.30 toks/s, output: 2498.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:43<00:09, 71.01it/s, est. speed input: 6089.79 toks/s, output: 2587.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:44<00:08, 81.86it/s, est. speed input: 6228.96 toks/s, output: 2655.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:44<00:09, 68.53it/s, est. speed input: 6304.43 toks/s, output: 2703.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:44<00:10, 64.00it/s, est. speed input: 6382.51 toks/s, output: 2736.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:44<00:12, 53.30it/s, est. speed input: 6453.57 toks/s, output: 2776.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:44<00:12, 52.42it/s, est. speed input: 6571.62 toks/s, output: 2849.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:45<00:08, 74.46it/s, est. speed input: 6756.31 toks/s, output: 2956.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:45<00:04, 119.27it/s, est. speed input: 7108.15 toks/s, output: 3169.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:45<00:04, 123.31it/s, est. speed input: 7333.64 toks/s, output: 3299.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:45<00:04, 119.39it/s, est. speed input: 7466.00 toks/s, output: 3350.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:45<00:04, 122.66it/s, est. speed input: 7599.38 toks/s, output: 3434.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:45<00:04, 119.46it/s, est. speed input: 7768.94 toks/s, output: 3548.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:46<00:04, 106.43it/s, est. speed input: 7888.44 toks/s, output: 3628.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:46<00:04, 107.02it/s, est. speed input: 8055.91 toks/s, output: 3747.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:46<00:03, 116.11it/s, est. speed input: 8276.95 toks/s, output: 3920.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:46<00:03, 113.65it/s, est. speed input: 8408.47 toks/s, output: 4008.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:46<00:03, 126.71it/s, est. speed input: 8619.45 toks/s, output: 4147.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:46<00:02, 133.49it/s, est. speed input: 8790.65 toks/s, output: 4261.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:46<00:03, 122.03it/s, est. speed input: 8903.05 toks/s, output: 4360.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:47<00:03, 105.35it/s, est. speed input: 9013.39 toks/s, output: 4427.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:47<00:02, 130.14it/s, est. speed input: 9242.62 toks/s, output: 4608.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:47<00:02, 120.79it/s, est. speed input: 9364.83 toks/s, output: 4692.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:47<00:01, 152.99it/s, est. speed input: 9632.15 toks/s, output: 4918.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:47<00:01, 143.25it/s, est. speed input: 9803.47 toks/s, output: 5073.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:47<00:01, 132.64it/s, est. speed input: 10008.62 toks/s, output: 5253.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:48<00:01, 119.16it/s, est. speed input: 10117.29 toks/s, output: 5348.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:48<00:01, 121.02it/s, est. speed input: 10238.12 toks/s, output: 5470.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:48<00:01, 127.15it/s, est. speed input: 10354.60 toks/s, output: 5566.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:48<00:01, 123.87it/s, est. speed input: 10465.58 toks/s, output: 5644.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:48<00:01, 91.83it/s, est. speed input: 10549.32 toks/s, output: 5704.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:48<00:01, 117.74it/s, est. speed input: 10764.31 toks/s, output: 5867.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:49<00:01, 105.54it/s, est. speed input: 10873.20 toks/s, output: 5987.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:49<00:00, 107.07it/s, est. speed input: 11054.09 toks/s, output: 6206.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:49<00:00, 94.33it/s, est. speed input: 11186.49 toks/s, output: 6337.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:49<00:00, 78.21it/s, est. speed input: 11254.98 toks/s, output: 6411.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:49<00:00, 77.04it/s, est. speed input: 11320.11 toks/s, output: 6474.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:50<00:00, 80.62it/s, est. speed input: 11398.36 toks/s, output: 6555.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:50<00:00, 47.93it/s, est. speed input: 11382.45 toks/s, output: 6575.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:50<00:00, 47.65it/s, est. speed input: 11431.66 toks/s, output: 6647.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:51<00:00, 35.97it/s, est. speed input: 11418.32 toks/s, output: 6687.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:51<00:00, 35.98it/s, est. speed input: 11438.94 toks/s, output: 6726.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:51<00:00, 28.52it/s, est. speed input: 11409.45 toks/s, output: 6713.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 7.32it/s, est. speed input: 10903.21 toks/s, output: 6443.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:00<00:00, 2.49it/s, est. speed input: 9785.82 toks/s, output: 5817.46 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:00<00:00, 21.06it/s, est. speed input: 9785.82 toks/s, output: 5817.46 toks/s]
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.633
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 937784
[36m(Runner pid=3309020)[0m balanced_min: 937784
[36m(Runner pid=3309020)[0m max: 938082
[36m(Runner pid=3309020)[0m mean: 937784.0
[36m(Runner pid=3309020)[0m min: 937486
[36m(Runner pid=3309020)[0m minmax_diff: 596
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.188
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.115
[36m(Runner pid=3309020)[0m throughput: 900.717
[36m(Runner pid=3309020)[0m time_per_step: 1041.152
[36m(Runner pid=3309020)[0m total_num_tokens: 1875568
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 671.0
[36m(Runner pid=3309020)[0m mean: 464.801
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1060.0
[36m(Runner pid=3309020)[0m mean: 267.843
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.269
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.633
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.147
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.048
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.3
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.219
[36m(Runner pid=3309020)[0m gen: 100.929
[36m(Runner pid=3309020)[0m old: 84.53
[36m(Runner pid=3309020)[0m ref: 90.625
[36m(Runner pid=3309020)[0m reward: 6.696
[36m(Runner pid=3309020)[0m save_checkpoint: 31.014
[36m(Runner pid=3309020)[0m step: 1041.152
[36m(Runner pid=3309020)[0m update_actor: 562.336
[36m(Runner pid=3309020)[0m validation: 164.207
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.329
[36m(Runner pid=3309020)[0m format_reward: 0.989
[36m(Runner pid=3309020)[0m overall_reward: 0.66
[36m(Runner pid=3309020)[0m reward_score: 0.66
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.993
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 26; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:44:36 [executor_base.py:219] It took 0.343838 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:46:03 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:44:36 [executor_base.py:219] It took 0.341128 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:46:04 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:46:04 [executor_base.py:208] It took 0.327568 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:46:11 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:46:12 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:46:12 [executor_base.py:208] It took 0.328898 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3126097023487091, 'actor/pg_clipfrac': 0.0027777778450399637, 'actor/ppo_kl': 0.0008265998912975192}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.41673827171325684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.22528593242168427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005147674237377942}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0003481849853415042, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025706819724291563}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5503719449043274, 'actor/pg_clipfrac': 0.0007194244535639882, 'actor/ppo_kl': 0.0014925216091796756}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0952424705028534, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000397092109778896}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.25480470061302185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.05325546860694885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002501060371287167}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.10347090661525726, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0037453470285981894}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3402397632598877, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.15078015625476837, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2734117805957794, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001412492711097002}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.2816306948661804, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3612528443336487, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.08463858813047409, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4744439125061035, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022532785078510642}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.062202513217926025, 'actor/pg_clipfrac': 0.0007849293760955334, 'actor/ppo_kl': 0.002017471706494689}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1381174623966217, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000322174106258899}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002756415051408112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.003010126296430826}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.006224298384040594, 'actor/pg_clipfrac': 0.0017079418757930398, 'actor/ppo_kl': 0.00014859890507068485}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.19135847687721252, 'actor/pg_clipfrac': 0.0036003601271659136, 'actor/ppo_kl': 0.0003383921575732529}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00015563944180030376, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002454688656143844}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.411765992641449, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014309650287032127}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00019381503807380795, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009386639576405287}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.20803067088127136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003552921989466995}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.1046764999628067, 'actor/pg_clipfrac': 0.0013157895300537348, 'actor/ppo_kl': -0.0019065957749262452}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.057341158390045166, 'actor/pg_clipfrac': 0.0007283321465365589, 'actor/ppo_kl': -0.0010821695905178785}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2767859399318695, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027210250846110284}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.526661217212677, 'actor/pg_clipfrac': 0.002998500829562545, 'actor/ppo_kl': -0.002725618425756693}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002900490944739431, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0023744883947074413}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.5909667015075684, 'actor/pg_clipfrac': 0.0034129691775888205, 'actor/ppo_kl': 0.002229518024250865}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002612174430396408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006288885488174856}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.12877613306045532, 'actor/pg_clipfrac': 0.005188067443668842, 'actor/ppo_kl': -0.0027585604693740606}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0001753911201376468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002314153127372265}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.2966535687446594, 'actor/pg_clipfrac': 0.002415458904579282, 'actor/ppo_kl': -0.0012825675075873733}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.22211593389511108, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00042788576683960855}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.09490301460027695, 'actor/pg_clipfrac': 0.002222222276031971, 'actor/ppo_kl': 0.0013186391443014145}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00018409352924209088, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000483066716697067}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5409629344940186, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000155242916662246}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00015734651242382824, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00023184032761491835}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001793137489585206, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001033967244438827}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.05412759259343147, 'actor/pg_clipfrac': 0.0014124293811619282, 'actor/ppo_kl': 0.001934595755301416}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.26368623971939087, 'actor/pg_clipfrac': 0.0007017544121481478, 'actor/ppo_kl': -0.0014175843680277467}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.17037490010261536, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002746942627709359}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.38996267318725586, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001049269994837232}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003159311308991164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019025028450414538}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.7027022838592529, 'actor/pg_clipfrac': 0.0022650056052953005, 'actor/ppo_kl': -0.0010887504322454333}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2533852756023407, 'actor/pg_clipfrac': 0.0009606147650629282, 'actor/ppo_kl': 0.002255695406347513}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.9209908843040466, 'actor/pg_clipfrac': 0.005181347019970417, 'actor/ppo_kl': -0.0022614554036408663}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3273254632949829, 'actor/pg_clipfrac': 0.0018951358506456017, 'actor/ppo_kl': 0.00033729345886968076}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.11732742190361023, 'actor/pg_clipfrac': 0.001226993859745562, 'actor/ppo_kl': -0.0006133659044280648}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1348111778497696, 'actor/pg_clipfrac': 0.0011890606256201863, 'actor/ppo_kl': 0.0019559883512556553}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2820188105106354, 'actor/pg_clipfrac': 0.0007535794866271317, 'actor/ppo_kl': 0.0003759934625122696}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.30803191661834717, 'actor/pg_clipfrac': 0.005747126415371895, 'actor/ppo_kl': 0.0004017215978819877}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.16486769914627075, 'actor/pg_clipfrac': 0.0012626262614503503, 'actor/ppo_kl': 0.00017848279094323516}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.0611133798956871, 'actor/pg_clipfrac': 0.0006468305364251137, 'actor/ppo_kl': 0.0011602132581174374}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.11203628033399582, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017087954329326749}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00035511041642166674, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0021956567652523518}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:35:10, 4.48s/it, est. speed input: 100.03 toks/s, output: 20.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<42:54, 2.03s/it, est. speed input: 183.72 toks/s, output: 38.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<26:20, 1.25s/it, est. speed input: 261.47 toks/s, output: 56.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<19:03, 1.10it/s, est. speed input: 327.04 toks/s, output: 74.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<13:19, 1.57it/s, est. speed input: 396.63 toks/s, output: 97.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<10:41, 1.95it/s, est. speed input: 454.47 toks/s, output: 113.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<07:58, 2.60it/s, est. speed input: 522.77 toks/s, output: 131.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<06:22, 3.25it/s, est. speed input: 585.78 toks/s, output: 148.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<03:59, 5.14it/s, est. speed input: 710.48 toks/s, output: 192.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<03:12, 6.33it/s, est. speed input: 826.66 toks/s, output: 229.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:33<02:24, 8.39it/s, est. speed input: 951.38 toks/s, output: 266.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<02:19, 8.63it/s, est. speed input: 1003.39 toks/s, output: 281.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:33<01:55, 10.41it/s, est. speed input: 1061.50 toks/s, output: 305.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:47, 11.12it/s, est. speed input: 1118.98 toks/s, output: 325.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<01:09, 17.06it/s, est. speed input: 1253.14 toks/s, output: 367.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:34<01:08, 17.24it/s, est. speed input: 1310.55 toks/s, output: 386.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:34<00:36, 32.16it/s, est. speed input: 1553.81 toks/s, output: 475.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:35<00:43, 26.76it/s, est. speed input: 1600.06 toks/s, output: 494.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:35<00:43, 26.35it/s, est. speed input: 1658.72 toks/s, output: 516.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:35<00:38, 29.42it/s, est. speed input: 1773.92 toks/s, output: 557.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:35<00:39, 28.42it/s, est. speed input: 1828.16 toks/s, output: 575.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:36<00:30, 36.89it/s, est. speed input: 1949.77 toks/s, output: 622.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:36<00:21, 52.83it/s, est. speed input: 2126.07 toks/s, output: 692.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:36<00:28, 38.46it/s, est. speed input: 2219.28 toks/s, output: 735.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:37<00:37, 29.22it/s, est. speed input: 2315.58 toks/s, output: 776.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:37<00:29, 36.35it/s, est. speed input: 2435.53 toks/s, output: 826.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:37<00:21, 50.61it/s, est. speed input: 2618.53 toks/s, output: 897.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:37<00:18, 57.06it/s, est. speed input: 2729.98 toks/s, output: 941.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:37<00:19, 52.77it/s, est. speed input: 2836.47 toks/s, output: 987.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:37<00:15, 68.18it/s, est. speed input: 3008.12 toks/s, output: 1069.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:38<00:19, 51.26it/s, est. speed input: 3095.33 toks/s, output: 1105.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:38<00:23, 43.75it/s, est. speed input: 3190.45 toks/s, output: 1138.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:38<00:23, 42.35it/s, est. speed input: 3287.88 toks/s, output: 1178.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:38<00:19, 50.48it/s, est. speed input: 3396.16 toks/s, output: 1225.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:38<00:18, 54.08it/s, est. speed input: 3501.73 toks/s, output: 1276.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:39<00:15, 62.39it/s, est. speed input: 3607.67 toks/s, output: 1328.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:39<00:10, 90.73it/s, est. speed input: 3828.01 toks/s, output: 1412.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:39<00:11, 84.90it/s, est. speed input: 3983.41 toks/s, output: 1477.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:39<00:18, 49.23it/s, est. speed input: 4093.69 toks/s, output: 1535.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:40<00:17, 50.90it/s, est. speed input: 4238.92 toks/s, output: 1597.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:40<00:16, 55.10it/s, est. speed input: 4341.28 toks/s, output: 1647.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:40<00:13, 66.77it/s, est. speed input: 4501.18 toks/s, output: 1723.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:40<00:12, 69.03it/s, est. speed input: 4599.94 toks/s, output: 1778.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:40<00:12, 66.36it/s, est. speed input: 4696.32 toks/s, output: 1824.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:40<00:13, 64.80it/s, est. speed input: 4793.79 toks/s, output: 1873.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:41<00:14, 59.76it/s, est. speed input: 4882.40 toks/s, output: 1907.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:41<00:12, 65.16it/s, est. speed input: 4980.43 toks/s, output: 1964.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:41<00:14, 56.46it/s, est. speed input: 5067.98 toks/s, output: 2011.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:41<00:09, 80.55it/s, est. speed input: 5271.37 toks/s, output: 2108.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:41<00:08, 94.46it/s, est. speed input: 5474.55 toks/s, output: 2208.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:41<00:06, 116.57it/s, est. speed input: 5742.69 toks/s, output: 2337.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:42<00:07, 100.96it/s, est. speed input: 5874.48 toks/s, output: 2385.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:42<00:06, 103.73it/s, est. speed input: 6016.39 toks/s, output: 2451.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:42<00:06, 116.51it/s, est. speed input: 6215.58 toks/s, output: 2554.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:42<00:06, 109.24it/s, est. speed input: 6360.46 toks/s, output: 2622.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:42<00:07, 88.95it/s, est. speed input: 6486.02 toks/s, output: 2696.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:42<00:08, 82.73it/s, est. speed input: 6565.88 toks/s, output: 2746.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:43<00:07, 87.34it/s, est. speed input: 6703.15 toks/s, output: 2832.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:43<00:08, 75.45it/s, est. speed input: 6874.71 toks/s, output: 2938.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:43<00:07, 85.45it/s, est. speed input: 7019.61 toks/s, output: 3004.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:43<00:09, 64.54it/s, est. speed input: 7080.74 toks/s, output: 3040.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:43<00:06, 91.69it/s, est. speed input: 7321.60 toks/s, output: 3169.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:44<00:05, 99.21it/s, est. speed input: 7454.34 toks/s, output: 3242.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:44<00:04, 129.60it/s, est. speed input: 7739.19 toks/s, output: 3407.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:44<00:03, 130.98it/s, est. speed input: 7877.53 toks/s, output: 3500.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:44<00:04, 125.89it/s, est. speed input: 8003.41 toks/s, output: 3576.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:44<00:04, 102.06it/s, est. speed input: 8114.31 toks/s, output: 3667.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:44<00:04, 103.14it/s, est. speed input: 8253.30 toks/s, output: 3751.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:44<00:03, 114.50it/s, est. speed input: 8429.12 toks/s, output: 3863.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:45<00:03, 138.75it/s, est. speed input: 8657.93 toks/s, output: 4012.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:45<00:03, 115.76it/s, est. speed input: 8775.66 toks/s, output: 4084.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:45<00:02, 139.71it/s, est. speed input: 9010.27 toks/s, output: 4217.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:45<00:02, 136.93it/s, est. speed input: 9187.15 toks/s, output: 4337.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:45<00:03, 117.39it/s, est. speed input: 9304.72 toks/s, output: 4437.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:45<00:02, 126.60it/s, est. speed input: 9480.46 toks/s, output: 4569.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:45<00:01, 156.27it/s, est. speed input: 9756.05 toks/s, output: 4765.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:46<00:01, 158.30it/s, est. speed input: 9929.28 toks/s, output: 4891.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:46<00:01, 165.20it/s, est. speed input: 10114.26 toks/s, output: 5042.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:46<00:01, 151.86it/s, est. speed input: 10283.13 toks/s, output: 5178.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:46<00:01, 162.59it/s, est. speed input: 10494.45 toks/s, output: 5325.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:46<00:01, 176.84it/s, est. speed input: 10773.83 toks/s, output: 5533.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:46<00:01, 114.02it/s, est. speed input: 10896.37 toks/s, output: 5641.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:47<00:01, 102.17it/s, est. speed input: 10997.37 toks/s, output: 5723.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:47<00:01, 105.56it/s, est. speed input: 11150.03 toks/s, output: 5876.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:47<00:01, 84.01it/s, est. speed input: 11226.84 toks/s, output: 5953.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:47<00:00, 102.07it/s, est. speed input: 11433.68 toks/s, output: 6123.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:47<00:00, 109.92it/s, est. speed input: 11552.81 toks/s, output: 6237.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:48<00:00, 73.30it/s, est. speed input: 11600.15 toks/s, output: 6319.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:48<00:00, 68.73it/s, est. speed input: 11650.14 toks/s, output: 6395.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:48<00:00, 71.42it/s, est. speed input: 11711.25 toks/s, output: 6467.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:48<00:00, 57.77it/s, est. speed input: 11742.22 toks/s, output: 6540.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:49<00:00, 44.58it/s, est. speed input: 11745.91 toks/s, output: 6614.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:49<00:00, 50.35it/s, est. speed input: 11812.02 toks/s, output: 6677.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:49<00:00, 38.75it/s, est. speed input: 11810.31 toks/s, output: 6716.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:51<00:00, 17.05it/s, est. speed input: 11576.38 toks/s, output: 6646.59 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:51<00:00, 25.00it/s, est. speed input: 11576.38 toks/s, output: 6646.59 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00024444653536193073, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007343121687881649}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.15118123590946198, 'actor/pg_clipfrac': 0.0015637216856703162, 'actor/ppo_kl': 0.001014764653518796}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.07352756708860397, 'actor/pg_clipfrac': 0.001923076924867928, 'actor/ppo_kl': 0.0016449103131890297}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.5059404969215393, 'actor/pg_clipfrac': 0.001476014731451869, 'actor/ppo_kl': -0.00039832337643019855}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3283170163631439, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015712225576862693}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00023848791897762567, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0024103305768221617}
[36m(Runner pid=3309020)[0m Step 26
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.259
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.019
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 1.3681143782662275e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.023
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.023
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 954949
[36m(Runner pid=3309020)[0m balanced_min: 954724
[36m(Runner pid=3309020)[0m max: 961285
[36m(Runner pid=3309020)[0m mean: 954836.5
[36m(Runner pid=3309020)[0m min: 948388
[36m(Runner pid=3309020)[0m minmax_diff: 12897
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.469
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.117
[36m(Runner pid=3309020)[0m throughput: 1116.688
[36m(Runner pid=3309020)[0m time_per_step: 855.061
[36m(Runner pid=3309020)[0m total_num_tokens: 1909673
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 465.271
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2197.0
[36m(Runner pid=3309020)[0m mean: 280.695
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.263
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.63
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.154
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.295
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.208
[36m(Runner pid=3309020)[0m gen: 110.336
[36m(Runner pid=3309020)[0m old: 87.545
[36m(Runner pid=3309020)[0m ref: 87.319
[36m(Runner pid=3309020)[0m reward: 6.475
[36m(Runner pid=3309020)[0m step: 855.061
[36m(Runner pid=3309020)[0m update_actor: 562.55
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 27; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 03:58:54 [executor_base.py:219] It took 0.341251 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:00:17 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 03:58:54 [executor_base.py:219] It took 0.341011 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:00:18 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:00:18 [executor_base.py:208] It took 0.326813 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:00:42 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:00:42 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:00:42 [executor_base.py:208] It took 0.328852 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0067062461748719215, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00026315610739402473, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002247322117909789, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.32925477623939514, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.210078164935112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.10335274785757065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4717327952384949, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019927052780985832}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.29070422053337097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.4199291467666626, 'actor/pg_clipfrac': 0.0005858230870217085, 'actor/ppo_kl': -0.00017876281344797462}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021372536139097065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00010608058073557913}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.05791819840669632, 'actor/pg_clipfrac': 0.0011947430903092027, 'actor/ppo_kl': -0.0001128469011746347}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3647131025791168, 'actor/pg_clipfrac': 0.0015923567116260529, 'actor/ppo_kl': -0.00041379747563041747}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0001811489782994613, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.32844221591949463, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00019417586736381054, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005104720476083457}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.0497802272439003, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4255426228046417, 'actor/pg_clipfrac': 0.0007107320707291365, 'actor/ppo_kl': 0.0008872804464772344}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3987509310245514, 'actor/pg_clipfrac': 0.0028860028833150864, 'actor/ppo_kl': 0.002268899930641055}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08124402910470963, 'actor/pg_clipfrac': 0.0008196721319109201, 'actor/ppo_kl': 0.000148215054650791}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00018673806334845722, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005131313228048384}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001700080028967932, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025303169968537986}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.12367945164442062, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00032199479755945504}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.15028168261051178, 'actor/pg_clipfrac': 0.0010111223673447967, 'actor/ppo_kl': -0.0014516330556944013}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.24886095523834229, 'actor/pg_clipfrac': 0.001490312977693975, 'actor/ppo_kl': 0.0012921703746542335}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.4469743072986603, 'actor/pg_clipfrac': 0.001917545567266643, 'actor/ppo_kl': 0.0006691703456453979}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.33086398243904114, 'actor/pg_clipfrac': 0.002249718876555562, 'actor/ppo_kl': -0.001506029861047864}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00023008565767668188, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000798246415797621}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.4972396492958069, 'actor/pg_clipfrac': 0.0007886435487307608, 'actor/ppo_kl': 0.0002616136334836483}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.013322653248906136, 'actor/pg_clipfrac': 0.0014326648088172078, 'actor/ppo_kl': -0.0009207356488332152}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.07392037659883499, 'actor/pg_clipfrac': 0.0019132653251290321, 'actor/ppo_kl': -0.0009543689084239304}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.1976437121629715, 'actor/pg_clipfrac': 0.0010869564721360803, 'actor/ppo_kl': 0.001954665407538414}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5158159136772156, 'actor/pg_clipfrac': 0.0022321429569274187, 'actor/ppo_kl': -0.0026892146561294794}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.26585376262664795, 'actor/pg_clipfrac': 0.0014104372821748257, 'actor/ppo_kl': -0.00013132997264619917}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.22443845868110657, 'actor/pg_clipfrac': 0.003401360474526882, 'actor/ppo_kl': -0.0008132960647344589}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.09192174673080444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016057105967774987}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.15212216973304749, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009163940558210015}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001181785628432408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00014331337297335267}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00018962027388624847, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000571662385482341}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.35788869857788086, 'actor/pg_clipfrac': 0.0013227512827143073, 'actor/ppo_kl': 0.0012047833297401667}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00013726079487241805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004877573228441179}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.21731746196746826, 'actor/pg_clipfrac': 0.000895255128853023, 'actor/ppo_kl': -0.0004683778970502317}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00014434482727665454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006158478790894151}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.26656967401504517, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007822918123565614}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.05714861676096916, 'actor/pg_clipfrac': 0.001243781065568328, 'actor/ppo_kl': 0.00038902557571418583}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.32411742210388184, 'actor/pg_clipfrac': 0.0017761989729478955, 'actor/ppo_kl': -0.001428046845830977}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.6153585314750671, 'actor/pg_clipfrac': 0.0007974481559358537, 'actor/ppo_kl': 0.0011819523060694337}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2095041573047638, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0035317738074809313}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.24956397712230682, 'actor/pg_clipfrac': 0.0007278020493686199, 'actor/ppo_kl': 0.0002807397977449}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2590050995349884, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000903154956176877}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2238217443227768, 'actor/pg_clipfrac': 0.001280409749597311, 'actor/ppo_kl': -0.0016327978810295463}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3641812801361084, 'actor/pg_clipfrac': 0.0009551098337396979, 'actor/ppo_kl': -9.220308857038617e-05}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:26<1:51:29, 5.25s/it, est. speed input: 86.72 toks/s, output: 22.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<50:15, 2.37s/it, est. speed input: 164.86 toks/s, output: 43.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<28:55, 1.37s/it, est. speed input: 235.79 toks/s, output: 62.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:30<19:30, 1.08it/s, est. speed input: 303.84 toks/s, output: 80.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<12:59, 1.61it/s, est. speed input: 372.36 toks/s, output: 101.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<09:22, 2.22it/s, est. speed input: 442.44 toks/s, output: 119.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<06:43, 3.08it/s, est. speed input: 522.99 toks/s, output: 138.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<04:46, 4.33it/s, est. speed input: 585.48 toks/s, output: 157.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<03:34, 5.76it/s, est. speed input: 651.56 toks/s, output: 178.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<02:57, 6.93it/s, est. speed input: 711.76 toks/s, output: 194.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:32<02:20, 8.73it/s, est. speed input: 778.30 toks/s, output: 218.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<01:22, 14.64it/s, est. speed input: 908.83 toks/s, output: 258.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<01:20, 15.04it/s, est. speed input: 970.33 toks/s, output: 275.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<01:36, 12.50it/s, est. speed input: 1020.94 toks/s, output: 290.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:33<01:09, 17.07it/s, est. speed input: 1150.81 toks/s, output: 330.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<01:13, 16.19it/s, est. speed input: 1266.20 toks/s, output: 372.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:34<01:11, 16.47it/s, est. speed input: 1318.31 toks/s, output: 391.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:35<00:40, 28.49it/s, est. speed input: 1562.72 toks/s, output: 480.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:35<00:38, 30.25it/s, est. speed input: 1677.90 toks/s, output: 523.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:35<00:41, 27.65it/s, est. speed input: 1847.07 toks/s, output: 587.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:36<00:33, 33.11it/s, est. speed input: 1964.68 toks/s, output: 625.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:36<00:35, 31.38it/s, est. speed input: 2013.68 toks/s, output: 644.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:36<00:37, 29.89it/s, est. speed input: 2061.37 toks/s, output: 666.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:36<00:29, 37.73it/s, est. speed input: 2175.62 toks/s, output: 710.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:36<00:29, 37.72it/s, est. speed input: 2229.45 toks/s, output: 735.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:36<00:23, 46.17it/s, est. speed input: 2345.42 toks/s, output: 774.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:37<00:26, 40.80it/s, est. speed input: 2446.82 toks/s, output: 824.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:37<00:27, 38.26it/s, est. speed input: 2554.57 toks/s, output: 867.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:37<00:32, 33.21it/s, est. speed input: 2595.56 toks/s, output: 886.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:38<01:03, 16.61it/s, est. speed input: 2602.03 toks/s, output: 889.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:38<00:37, 27.64it/s, est. speed input: 2816.51 toks/s, output: 977.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:39<00:43, 23.83it/s, est. speed input: 2850.69 toks/s, output: 993.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:39<00:41, 25.12it/s, est. speed input: 2898.55 toks/s, output: 1015.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:39<00:31, 31.94it/s, est. speed input: 2999.96 toks/s, output: 1069.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:39<00:26, 38.68it/s, est. speed input: 3102.67 toks/s, output: 1111.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:39<00:20, 48.15it/s, est. speed input: 3205.53 toks/s, output: 1164.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:40<00:20, 48.67it/s, est. speed input: 3310.14 toks/s, output: 1219.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:40<00:12, 75.86it/s, est. speed input: 3526.90 toks/s, output: 1323.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:40<00:12, 75.82it/s, est. speed input: 3689.44 toks/s, output: 1411.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:40<00:12, 73.95it/s, est. speed input: 3797.25 toks/s, output: 1476.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:40<00:19, 47.82it/s, est. speed input: 3870.35 toks/s, output: 1520.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:41<00:12, 70.29it/s, est. speed input: 4134.79 toks/s, output: 1652.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:41<00:14, 63.22it/s, est. speed input: 4265.65 toks/s, output: 1738.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:41<00:11, 73.48it/s, est. speed input: 4425.75 toks/s, output: 1814.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:41<00:11, 73.16it/s, est. speed input: 4579.88 toks/s, output: 1884.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:41<00:09, 85.55it/s, est. speed input: 4780.38 toks/s, output: 1976.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:42<00:07, 111.22it/s, est. speed input: 5039.57 toks/s, output: 2104.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:42<00:09, 83.70it/s, est. speed input: 5167.40 toks/s, output: 2183.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:42<00:07, 100.65it/s, est. speed input: 5365.23 toks/s, output: 2282.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:42<00:08, 86.65it/s, est. speed input: 5501.77 toks/s, output: 2338.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:42<00:08, 91.83it/s, est. speed input: 5648.70 toks/s, output: 2421.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:42<00:06, 105.68it/s, est. speed input: 5849.02 toks/s, output: 2529.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:43<00:05, 136.55it/s, est. speed input: 6160.38 toks/s, output: 2671.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:43<00:05, 118.72it/s, est. speed input: 6339.73 toks/s, output: 2772.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:43<00:04, 138.60it/s, est. speed input: 6634.47 toks/s, output: 2924.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:43<00:05, 123.33it/s, est. speed input: 6824.17 toks/s, output: 3029.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:43<00:04, 142.03it/s, est. speed input: 7074.21 toks/s, output: 3176.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:44<00:04, 120.79it/s, est. speed input: 7271.71 toks/s, output: 3296.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:44<00:05, 101.91it/s, est. speed input: 7390.07 toks/s, output: 3370.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:44<00:04, 118.39it/s, est. speed input: 7598.11 toks/s, output: 3481.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:44<00:05, 100.59it/s, est. speed input: 7714.51 toks/s, output: 3566.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:44<00:06, 82.04it/s, est. speed input: 7812.43 toks/s, output: 3622.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:45<00:07, 68.13it/s, est. speed input: 7870.06 toks/s, output: 3669.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:45<00:05, 85.00it/s, est. speed input: 8045.40 toks/s, output: 3768.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:45<00:04, 96.03it/s, est. speed input: 8184.67 toks/s, output: 3861.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:45<00:04, 106.77it/s, est. speed input: 8323.13 toks/s, output: 3947.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:45<00:04, 111.95it/s, est. speed input: 8446.63 toks/s, output: 4032.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:45<00:04, 106.50it/s, est. speed input: 8572.82 toks/s, output: 4115.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:45<00:03, 122.76it/s, est. speed input: 8762.94 toks/s, output: 4242.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:45<00:03, 118.02it/s, est. speed input: 8888.17 toks/s, output: 4331.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:46<00:03, 121.08it/s, est. speed input: 9012.49 toks/s, output: 4424.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:46<00:04, 87.21it/s, est. speed input: 9105.27 toks/s, output: 4497.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:46<00:03, 104.03it/s, est. speed input: 9318.23 toks/s, output: 4671.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:46<00:03, 105.43it/s, est. speed input: 9437.19 toks/s, output: 4762.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:46<00:02, 108.47it/s, est. speed input: 9556.92 toks/s, output: 4847.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:46<00:01, 142.84it/s, est. speed input: 9827.14 toks/s, output: 5039.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:47<00:02, 97.43it/s, est. speed input: 9948.05 toks/s, output: 5125.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:47<00:02, 94.79it/s, est. speed input: 10059.23 toks/s, output: 5231.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:47<00:02, 97.70it/s, est. speed input: 10174.17 toks/s, output: 5335.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:47<00:02, 103.10it/s, est. speed input: 10336.76 toks/s, output: 5484.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:47<00:02, 97.66it/s, est. speed input: 10444.59 toks/s, output: 5592.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:48<00:01, 126.70it/s, est. speed input: 10692.31 toks/s, output: 5813.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:48<00:01, 111.59it/s, est. speed input: 10789.79 toks/s, output: 5895.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:48<00:01, 105.88it/s, est. speed input: 10902.47 toks/s, output: 6006.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:48<00:01, 100.97it/s, est. speed input: 11009.38 toks/s, output: 6106.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:48<00:01, 86.20it/s, est. speed input: 11106.62 toks/s, output: 6199.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:49<00:01, 70.81it/s, est. speed input: 11154.46 toks/s, output: 6270.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:49<00:01, 71.53it/s, est. speed input: 11216.98 toks/s, output: 6330.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:49<00:01, 73.17it/s, est. speed input: 11281.99 toks/s, output: 6398.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:49<00:00, 87.80it/s, est. speed input: 11399.32 toks/s, output: 6540.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:49<00:00, 80.70it/s, est. speed input: 11466.69 toks/s, output: 6610.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:49<00:00, 78.68it/s, est. speed input: 11526.45 toks/s, output: 6696.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:50<00:00, 51.47it/s, est. speed input: 11529.90 toks/s, output: 6747.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:50<00:00, 44.94it/s, est. speed input: 11555.87 toks/s, output: 6801.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:50<00:00, 36.58it/s, est. speed input: 11551.06 toks/s, output: 6839.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:50<00:00, 36.89it/s, est. speed input: 11564.47 toks/s, output: 6876.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:52<00:00, 10.81it/s, est. speed input: 11211.40 toks/s, output: 6701.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 4.40it/s, est. speed input: 10539.28 toks/s, output: 6333.24 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.71it/s, est. speed input: 10539.28 toks/s, output: 6333.24 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.4837260842323303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001835967181250453}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5784692168235779, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010225388687103987}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00019234763749409467, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016119498759508133}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00023365077504422516, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010495086899027228}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5293020606040955, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.9813684048131108e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.20127259194850922, 'actor/pg_clipfrac': 0.0022354694083333015, 'actor/ppo_kl': 0.0016093417070806026}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5719771981239319, 'actor/pg_clipfrac': 0.0006447453051805496, 'actor/ppo_kl': 0.0005510786431841552}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.40066227316856384, 'actor/pg_clipfrac': 0.0017809439450502396, 'actor/ppo_kl': 3.661158189061098e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00013212283374741673, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002485821896698326}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.06405516713857651, 'actor/pg_clipfrac': 0.000834724516607821, 'actor/ppo_kl': 0.0018879774725064635}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.05754696577787399, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009750769240781665}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00020383640367072076, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008942458080127835}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.044927097856998444, 'actor/pg_clipfrac': 0.001055966247804463, 'actor/ppo_kl': 0.001462052226997912}
[36m(Runner pid=3309020)[0m Step 27
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.259
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.019
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.003
[36m(Runner pid=3309020)[0m ppo_kl: 9.786154300783778e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.627
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.627
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 944017
[36m(Runner pid=3309020)[0m balanced_min: 944017
[36m(Runner pid=3309020)[0m max: 955058
[36m(Runner pid=3309020)[0m mean: 944017.0
[36m(Runner pid=3309020)[0m min: 932976
[36m(Runner pid=3309020)[0m minmax_diff: 22082
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.069
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.116
[36m(Runner pid=3309020)[0m throughput: 1090.193
[36m(Runner pid=3309020)[0m time_per_step: 865.917
[36m(Runner pid=3309020)[0m total_num_tokens: 1888034
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 671.0
[36m(Runner pid=3309020)[0m mean: 465.236
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1117.0
[36m(Runner pid=3309020)[0m mean: 272.277
[36m(Runner pid=3309020)[0m min: 47.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.256
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.627
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.334945556016133e-05
[36m(Runner pid=3309020)[0m gen: 0.178
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.297
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.176
[36m(Runner pid=3309020)[0m gen: 123.835
[36m(Runner pid=3309020)[0m old: 85.921
[36m(Runner pid=3309020)[0m ref: 86.984
[36m(Runner pid=3309020)[0m reward: 6.573
[36m(Runner pid=3309020)[0m step: 865.917
[36m(Runner pid=3309020)[0m update_actor: 561.504
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 28; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:13:21 [executor_base.py:219] It took 0.340983 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:14:44 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:13:21 [executor_base.py:219] It took 0.339956 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:14:45 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:14:45 [executor_base.py:208] It took 0.327186 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:14:49 [block_pool.py:255] Successfully reset prefix cache
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:14:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:14:49 [executor_base.py:208] It took 0.326734 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00013950394350104034, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00047366853686980903}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002173367829527706, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002836724743247032}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001343063049716875, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009529810631647706}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.37500709295272827, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.39853253960609436, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004113120958209038}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.18049491941928864, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00016633410996291786, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00021799920068588108, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006824081647209823}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00013060099445283413, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000531016499735415}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.08209361135959625, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.23241867125034332, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.07466170936822891, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00017141617718152702}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.08439435809850693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.20421738922595978, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6946604251861572, 'actor/pg_clipfrac': 0.001754386001266539, 'actor/ppo_kl': 0.0005871856701560318}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001700621360214427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.06755144894123077, 'actor/pg_clipfrac': 0.0015282731037586927, 'actor/ppo_kl': 0.0006627535913139582}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00012994985445402563, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004232749342918396}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00019173756300006062, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010519164061406627}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00020848566782660782, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.61259182379581e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.660156786441803, 'actor/pg_clipfrac': 0.003703703638166189, 'actor/ppo_kl': -0.0007552217575721443}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.044636547565460205, 'actor/pg_clipfrac': 0.0019920319318771362, 'actor/ppo_kl': -0.001875125104561448}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00019616918871179223, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000951229187194258}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002060857368633151, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011246540816500783}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00017508547171019018, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001861592405475676}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3356266915798187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011689516250044107}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2060432881116867, 'actor/pg_clipfrac': 0.0013140604132786393, 'actor/ppo_kl': 6.947411020519212e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.000186936988029629, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002695116854738444}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00017949742323253304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007281235884875059}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00027589761884883046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.487591938115656e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.044732075184583664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018985429778695107}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002043184358626604, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006362812709994614}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.05834059417247772, 'actor/pg_clipfrac': 0.000936329597607255, 'actor/ppo_kl': 0.00039094575913622975}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002214801643276587, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000794300576671958}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2746936082839966, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000759077025577426}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.11452639102935791, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010434697614982724}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.09028720110654831, 'actor/pg_clipfrac': 0.0006514657870866358, 'actor/ppo_kl': 0.0013193248305469751}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0074498625472188, 'actor/pg_clipfrac': 0.0008568980265408754, 'actor/ppo_kl': 0.0013523886445909739}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.281139612197876, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015416917158290744}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3696739971637726, 'actor/pg_clipfrac': 0.0013262599240988493, 'actor/ppo_kl': 0.0009469151264056563}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.37168192863464355, 'actor/pg_clipfrac': 0.0006489292718470097, 'actor/ppo_kl': -0.0017182005103677511}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5675545930862427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001269100612262264}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.35883960127830505, 'actor/pg_clipfrac': 0.0016906170640140772, 'actor/ppo_kl': -0.0015067860949784517}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00018981500761583447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003545951039995998}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.03616979718208313, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001034954097121954}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.16390369832515717, 'actor/pg_clipfrac': 0.0009165902738459408, 'actor/ppo_kl': -0.0001854551228461787}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:15<1:06:36, 3.13s/it, est. speed input: 146.11 toks/s, output: 21.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<40:47, 1.93s/it, est. speed input: 219.14 toks/s, output: 38.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<26:52, 1.28s/it, est. speed input: 290.95 toks/s, output: 58.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:23<16:28, 1.27it/s, est. speed input: 379.73 toks/s, output: 77.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:24<11:52, 1.76it/s, est. speed input: 459.55 toks/s, output: 94.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:26<10:58, 1.90it/s, est. speed input: 503.50 toks/s, output: 108.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:27<08:15, 2.51it/s, est. speed input: 569.90 toks/s, output: 124.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<09:20, 2.21it/s, est. speed input: 592.57 toks/s, output: 134.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:30<06:46, 3.04it/s, est. speed input: 664.23 toks/s, output: 155.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<05:14, 3.91it/s, est. speed input: 725.68 toks/s, output: 177.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:31<04:39, 4.39it/s, est. speed input: 780.04 toks/s, output: 191.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:32<04:11, 4.86it/s, est. speed input: 827.47 toks/s, output: 209.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<03:10, 6.37it/s, est. speed input: 890.33 toks/s, output: 231.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<02:16, 8.83it/s, est. speed input: 1012.83 toks/s, output: 268.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:34<02:21, 8.46it/s, est. speed input: 1061.28 toks/s, output: 285.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:58, 10.06it/s, est. speed input: 1118.80 toks/s, output: 304.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:34<01:46, 11.22it/s, est. speed input: 1178.69 toks/s, output: 322.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:50, 10.72it/s, est. speed input: 1278.91 toks/s, output: 350.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<01:31, 12.79it/s, est. speed input: 1336.06 toks/s, output: 372.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<01:18, 14.73it/s, est. speed input: 1491.21 toks/s, output: 427.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<01:09, 16.62it/s, est. speed input: 1545.62 toks/s, output: 444.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<01:01, 18.79it/s, est. speed input: 1607.84 toks/s, output: 468.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<01:01, 18.61it/s, est. speed input: 1658.43 toks/s, output: 490.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:37<00:51, 21.83it/s, est. speed input: 1765.69 toks/s, output: 539.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:37<00:53, 21.00it/s, est. speed input: 1810.03 toks/s, output: 560.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:55, 20.36it/s, est. speed input: 1857.70 toks/s, output: 578.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:38, 29.03it/s, est. speed input: 1972.59 toks/s, output: 625.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:38, 28.69it/s, est. speed input: 2021.92 toks/s, output: 645.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:38<00:28, 38.59it/s, est. speed input: 2138.28 toks/s, output: 684.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:22, 48.02it/s, est. speed input: 2250.37 toks/s, output: 734.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:33, 31.78it/s, est. speed input: 2342.96 toks/s, output: 778.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:39<00:24, 43.33it/s, est. speed input: 2511.95 toks/s, output: 849.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:39<00:20, 50.74it/s, est. speed input: 2623.70 toks/s, output: 901.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:40<00:28, 37.13it/s, est. speed input: 2706.72 toks/s, output: 939.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:24, 41.82it/s, est. speed input: 2815.25 toks/s, output: 988.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:23, 43.33it/s, est. speed input: 2915.46 toks/s, output: 1038.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:40<00:23, 42.67it/s, est. speed input: 3060.97 toks/s, output: 1096.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:41<00:26, 37.54it/s, est. speed input: 3151.85 toks/s, output: 1136.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:31, 31.23it/s, est. speed input: 3187.69 toks/s, output: 1149.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:29, 33.73it/s, est. speed input: 3277.37 toks/s, output: 1204.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:25, 37.69it/s, est. speed input: 3375.06 toks/s, output: 1258.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:26, 37.08it/s, est. speed input: 3419.60 toks/s, output: 1278.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:19, 48.90it/s, est. speed input: 3559.98 toks/s, output: 1353.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:13, 70.56it/s, est. speed input: 3769.75 toks/s, output: 1440.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:08, 107.38it/s, est. speed input: 4088.92 toks/s, output: 1605.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:42<00:08, 108.61it/s, est. speed input: 4236.41 toks/s, output: 1684.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:42<00:05, 151.14it/s, est. speed input: 4594.12 toks/s, output: 1879.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:43<00:07, 114.73it/s, est. speed input: 4776.62 toks/s, output: 1995.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:43<00:07, 109.45it/s, est. speed input: 4937.43 toks/s, output: 2067.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:43<00:07, 105.44it/s, est. speed input: 5079.77 toks/s, output: 2143.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:43<00:07, 102.11it/s, est. speed input: 5223.27 toks/s, output: 2210.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:43<00:07, 109.59it/s, est. speed input: 5421.05 toks/s, output: 2300.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:43<00:07, 102.56it/s, est. speed input: 5559.08 toks/s, output: 2384.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:43<00:07, 104.31it/s, est. speed input: 5704.25 toks/s, output: 2458.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:44<00:06, 113.57it/s, est. speed input: 5845.62 toks/s, output: 2535.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:44<00:07, 93.81it/s, est. speed input: 5981.27 toks/s, output: 2611.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:44<00:09, 73.55it/s, est. speed input: 6095.61 toks/s, output: 2679.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:44<00:08, 76.55it/s, est. speed input: 6286.59 toks/s, output: 2785.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:45<00:11, 58.62it/s, est. speed input: 6346.56 toks/s, output: 2817.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:45<00:10, 63.02it/s, est. speed input: 6429.34 toks/s, output: 2857.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:45<00:06, 92.10it/s, est. speed input: 6702.42 toks/s, output: 2990.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:45<00:06, 100.64it/s, est. speed input: 6842.89 toks/s, output: 3077.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:45<00:05, 105.97it/s, est. speed input: 6974.71 toks/s, output: 3157.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:45<00:05, 100.72it/s, est. speed input: 7098.57 toks/s, output: 3238.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:45<00:05, 112.17it/s, est. speed input: 7281.09 toks/s, output: 3338.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:46<00:04, 121.18it/s, est. speed input: 7464.20 toks/s, output: 3438.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:46<00:04, 123.62it/s, est. speed input: 7590.81 toks/s, output: 3520.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:46<00:04, 120.88it/s, est. speed input: 7719.50 toks/s, output: 3592.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:46<00:04, 124.67it/s, est. speed input: 7849.78 toks/s, output: 3680.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:46<00:03, 141.26it/s, est. speed input: 8027.41 toks/s, output: 3804.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:46<00:02, 152.88it/s, est. speed input: 8248.55 toks/s, output: 3962.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:47<00:04, 103.75it/s, est. speed input: 8398.47 toks/s, output: 4048.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:47<00:03, 108.41it/s, est. speed input: 8514.86 toks/s, output: 4135.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:47<00:04, 97.87it/s, est. speed input: 8626.69 toks/s, output: 4221.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:47<00:04, 94.70it/s, est. speed input: 8747.18 toks/s, output: 4287.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:47<00:03, 101.71it/s, est. speed input: 8908.59 toks/s, output: 4420.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:47<00:03, 98.66it/s, est. speed input: 9030.31 toks/s, output: 4520.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:47<00:02, 123.09it/s, est. speed input: 9254.60 toks/s, output: 4662.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:48<00:02, 136.72it/s, est. speed input: 9472.81 toks/s, output: 4823.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:48<00:01, 152.65it/s, est. speed input: 9681.70 toks/s, output: 4977.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:48<00:02, 107.86it/s, est. speed input: 9809.47 toks/s, output: 5083.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:48<00:02, 106.42it/s, est. speed input: 9922.07 toks/s, output: 5174.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:49<00:02, 76.84it/s, est. speed input: 9986.72 toks/s, output: 5256.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:49<00:02, 82.11it/s, est. speed input: 10102.31 toks/s, output: 5326.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:49<00:02, 79.27it/s, est. speed input: 10171.96 toks/s, output: 5393.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:49<00:02, 86.16it/s, est. speed input: 10286.59 toks/s, output: 5521.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:49<00:02, 82.11it/s, est. speed input: 10361.21 toks/s, output: 5600.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:49<00:01, 90.45it/s, est. speed input: 10466.36 toks/s, output: 5693.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:49<00:01, 110.85it/s, est. speed input: 10698.52 toks/s, output: 5884.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:50<00:01, 87.12it/s, est. speed input: 10783.32 toks/s, output: 5979.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:50<00:01, 79.56it/s, est. speed input: 10841.36 toks/s, output: 6035.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:50<00:01, 63.12it/s, est. speed input: 10874.27 toks/s, output: 6080.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:50<00:01, 67.94it/s, est. speed input: 10947.01 toks/s, output: 6142.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:50<00:01, 68.01it/s, est. speed input: 11005.44 toks/s, output: 6206.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:51<00:01, 69.30it/s, est. speed input: 11069.86 toks/s, output: 6265.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:51<00:00, 72.01it/s, est. speed input: 11130.12 toks/s, output: 6327.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:00, 57.90it/s, est. speed input: 11192.80 toks/s, output: 6427.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 40.85it/s, est. speed input: 11185.73 toks/s, output: 6456.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 40.43it/s, est. speed input: 11271.84 toks/s, output: 6579.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:52<00:00, 39.44it/s, est. speed input: 11286.33 toks/s, output: 6602.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:53<00:00, 26.60it/s, est. speed input: 11223.10 toks/s, output: 6596.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 5.89it/s, est. speed input: 10520.30 toks/s, output: 6218.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 6.92it/s, est. speed input: 10512.91 toks/s, output: 6254.76 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.41it/s, est. speed input: 10512.91 toks/s, output: 6254.76 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04082630202174187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005395518383011222}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.22015216946601868, 'actor/pg_clipfrac': 0.0021536252461373806, 'actor/ppo_kl': 0.0013032257556915283}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.294976145029068, 'actor/pg_clipfrac': 0.0009803922148421407, 'actor/ppo_kl': -0.0009467536583542824}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.22080017626285553, 'actor/pg_clipfrac': 0.0010799135779961944, 'actor/ppo_kl': 0.0008520534029230475}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.24831663072109222, 'actor/pg_clipfrac': 0.0013046314707025886, 'actor/ppo_kl': 0.0012780281249433756}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5475397706031799, 'actor/pg_clipfrac': 0.002209944650530815, 'actor/ppo_kl': 0.0007856147713027894}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.12738411128520966, 'actor/pg_clipfrac': 0.0006086427019909024, 'actor/ppo_kl': -0.000560277549084276}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.20668621361255646, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00026514509227126837}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.03408326581120491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005438824300654233}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00024828442838042974, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.284004095708951e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.22372309863567352, 'actor/pg_clipfrac': 0.0013071895809844136, 'actor/ppo_kl': -0.0008668563095852733}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.43739426136016846, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000899716280400753}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.12942002713680267, 'actor/pg_clipfrac': 0.0016666667070239782, 'actor/ppo_kl': 0.0018895752727985382}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.22463618218898773, 'actor/pg_clipfrac': 0.0029069767333567142, 'actor/ppo_kl': -0.00026820425409823656}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3507353961467743, 'actor/pg_clipfrac': 0.0016393442638218403, 'actor/ppo_kl': -0.0006052955286577344}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.41057637333869934, 'actor/pg_clipfrac': 0.0009881423320621252, 'actor/ppo_kl': -0.0005862345569767058}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6306070685386658, 'actor/pg_clipfrac': 0.0012853470398113132, 'actor/ppo_kl': 0.0005735174054279923}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.09868171811103821, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011293807765468955}
[36m(Runner pid=3309020)[0m Step 28
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.246
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.016
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 7.676971733872051e-07
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.633
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.633
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 956801
[36m(Runner pid=3309020)[0m balanced_min: 956800
[36m(Runner pid=3309020)[0m max: 962800
[36m(Runner pid=3309020)[0m mean: 956800.5
[36m(Runner pid=3309020)[0m min: 950801
[36m(Runner pid=3309020)[0m minmax_diff: 11999
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.812
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.118
[36m(Runner pid=3309020)[0m throughput: 1131.617
[36m(Runner pid=3309020)[0m time_per_step: 845.516
[36m(Runner pid=3309020)[0m total_num_tokens: 1913601
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 465.961
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1683.0
[36m(Runner pid=3309020)[0m mean: 281.539
[36m(Runner pid=3309020)[0m min: 56.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.269
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.633
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.059479010548018e-05
[36m(Runner pid=3309020)[0m gen: 0.142
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.294
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.173
[36m(Runner pid=3309020)[0m gen: 102.562
[36m(Runner pid=3309020)[0m old: 86.294
[36m(Runner pid=3309020)[0m ref: 87.302
[36m(Runner pid=3309020)[0m reward: 6.53
[36m(Runner pid=3309020)[0m step: 845.516
[36m(Runner pid=3309020)[0m update_actor: 562.03
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 29; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:27:27 [executor_base.py:219] It took 0.340368 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:28:54 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:27:27 [executor_base.py:219] It took 0.339876 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:28:54 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:28:54 [executor_base.py:208] It took 0.327958 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.80 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.89 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:28:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:28:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.89 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:28:56 [executor_base.py:208] It took 0.327436 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.17234894633293152, 'actor/pg_clipfrac': 0.0028985508251935244, 'actor/ppo_kl': -0.00016374449478462338}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.14488688111305237, 'actor/pg_clipfrac': 0.00081300811143592, 'actor/ppo_kl': -1.5041692904560477e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.12056435644626617, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5102819800376892, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.21878337860107422, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003315231006126851, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3157230019569397, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005392176099121571}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.23196204006671906, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016576998168602586}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.38430190086364746, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.05085837095975876, 'actor/pg_clipfrac': 0.001429592608474195, 'actor/ppo_kl': -0.0017193323001265526}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00016442895866930485, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.8919426202774048, 'actor/pg_clipfrac': 0.0009578543831594288, 'actor/ppo_kl': 0.0008399751386605203}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1583620011806488, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.3256475627422333, 'actor/pg_clipfrac': 0.004228329751640558, 'actor/ppo_kl': -0.002265853574499488}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0634850561618805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00021217871108092368, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00020053505431860685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005199990700930357}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.06214482709765434, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015549364034086466}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.35594892501831055, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011704877251759171}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.08942772448062897, 'actor/pg_clipfrac': 0.0009124087519012392, 'actor/ppo_kl': -0.0016279428964480758}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00014826073311269283, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015669394051656127}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.33902597427368164, 'actor/pg_clipfrac': 0.0006079027079977095, 'actor/ppo_kl': -0.0009091420797631145}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.19800616800785065, 'actor/pg_clipfrac': 0.0007215007208287716, 'actor/ppo_kl': -0.0016816732240840793}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.06469520926475525, 'actor/pg_clipfrac': 0.0012690355069935322, 'actor/ppo_kl': -0.001151731121353805}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.020484188571572304, 'actor/pg_clipfrac': 0.0008077544625848532, 'actor/ppo_kl': -0.00012653825979214162}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.268706738948822, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002737262984737754}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3780909776687622, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030660664197057486}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00021673778246622533, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005169874639250338}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00019703675934579223, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009760167449712753}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3269801437854767, 'actor/pg_clipfrac': 0.001825928222388029, 'actor/ppo_kl': -0.0005307908868417144}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.18616490066051483, 'actor/pg_clipfrac': 0.000518403307069093, 'actor/ppo_kl': -0.0014802879886701703}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.23586505651474, 'actor/pg_clipfrac': 0.0006439150311052799, 'actor/ppo_kl': 8.407563291257247e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.21993482112884521, 'actor/pg_clipfrac': 0.0018416206585243344, 'actor/ppo_kl': -0.0012290877057239413}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0001926143595483154, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009504568879492581}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.23671546578407288, 'actor/pg_clipfrac': 0.002679169410839677, 'actor/ppo_kl': -0.0009572833077982068}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00016147547285072505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00016037623572628945}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.1182335689663887, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011874315096065402}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.1571238934993744, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014161302242428064}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1383116990327835, 'actor/pg_clipfrac': 0.000860585190821439, 'actor/ppo_kl': -0.0013385090278461576}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00019245439034420997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014918509405106306}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.07519327849149704, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000715699337888509}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0001855343289207667, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006677620694972575}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.14936062693595886, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.9098619304713793e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002904733701143414, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001908477977849543}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.08345531672239304, 'actor/pg_clipfrac': 0.0041551245376467705, 'actor/ppo_kl': 0.0006955323624424636}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00019686869927681983, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -4.3721731344703585e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.410968154668808, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000936431810259819}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1863947957754135, 'actor/pg_clipfrac': 0.005444645881652832, 'actor/ppo_kl': -0.0003835709358099848}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1427648365497589, 'actor/pg_clipfrac': 0.002680965233594179, 'actor/ppo_kl': -0.00012607421376742423}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5364125967025757, 'actor/pg_clipfrac': 0.003267973894253373, 'actor/ppo_kl': -9.583492328602006e-07}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.05752892419695854, 'actor/pg_clipfrac': 0.002336448524147272, 'actor/ppo_kl': -0.0016968651907518506}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.48020458221435547, 'actor/pg_clipfrac': 0.0035419126506894827, 'actor/ppo_kl': 0.0001304856123169884}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00017911303439177573, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0028214931953698397}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.13323654234409332, 'actor/pg_clipfrac': 0.001733102253638208, 'actor/ppo_kl': 0.0006207130500115454}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.19748522341251373, 'actor/pg_clipfrac': 0.0012787723680958152, 'actor/ppo_kl': -0.0009200127678923309}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.8031814694404602, 'actor/pg_clipfrac': 0.0019841270986944437, 'actor/ppo_kl': -0.00020360000780783594}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5377243757247925, 'actor/pg_clipfrac': 0.0012391573982313275, 'actor/ppo_kl': -0.0019175754860043526}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3066267967224121, 'actor/pg_clipfrac': 0.0028382213786244392, 'actor/ppo_kl': 0.001847302308306098}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.11132896691560745, 'actor/pg_clipfrac': 0.001243008067831397, 'actor/ppo_kl': 0.0009608319378457963}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.09559841454029083, 'actor/pg_clipfrac': 0.0005646527279168367, 'actor/ppo_kl': -1.3888858120481018e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00013046854292042553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00021720354561693966}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00018142811313737184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010618368396535516}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3898780643939972, 'actor/pg_clipfrac': 0.0016313213855028152, 'actor/ppo_kl': 0.0010057577164843678}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00019345252076163888, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008072437485679984}
[36m(Runner pid=3309020)[0m Step 29
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.257
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.018
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.004
[36m(Runner pid=3309020)[0m ppo_kl: -6.0024028082850125e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.651
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.651
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 959956
[36m(Runner pid=3309020)[0m balanced_min: 959955
[36m(Runner pid=3309020)[0m max: 962217
[36m(Runner pid=3309020)[0m mean: 959955.5
[36m(Runner pid=3309020)[0m min: 957694
[36m(Runner pid=3309020)[0m minmax_diff: 4523
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 103.222
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.117
[36m(Runner pid=3309020)[0m throughput: 1127.039
[36m(Runner pid=3309020)[0m time_per_step: 851.75
[36m(Runner pid=3309020)[0m total_num_tokens: 1919911
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 807.0
[36m(Runner pid=3309020)[0m mean: 466.375
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1584.0
[36m(Runner pid=3309020)[0m mean: 283.59
[36m(Runner pid=3309020)[0m min: 63.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.304
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.651
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.142
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.294
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.192
[36m(Runner pid=3309020)[0m gen: 103.239
[36m(Runner pid=3309020)[0m old: 87.333
[36m(Runner pid=3309020)[0m ref: 88.813
[36m(Runner pid=3309020)[0m reward: 6.318
[36m(Runner pid=3309020)[0m step: 851.75
[36m(Runner pid=3309020)[0m update_actor: 565.203
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 30; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:15<1:07:57, 3.20s/it, est. speed input: 143.53 toks/s, output: 23.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:22<44:19, 2.09s/it, est. speed input: 204.87 toks/s, output: 37.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<26:50, 1.27s/it, est. speed input: 287.85 toks/s, output: 59.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<20:47, 1.01it/s, est. speed input: 346.48 toks/s, output: 70.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<17:44, 1.18it/s, est. speed input: 387.70 toks/s, output: 84.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<12:16, 1.70it/s, est. speed input: 461.52 toks/s, output: 104.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<08:27, 2.45it/s, est. speed input: 539.28 toks/s, output: 123.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<07:12, 2.87it/s, est. speed input: 593.84 toks/s, output: 141.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:31<03:08, 6.51it/s, est. speed input: 803.57 toks/s, output: 205.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:32<02:17, 8.85it/s, est. speed input: 942.54 toks/s, output: 242.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:32<02:15, 8.93it/s, est. speed input: 998.49 toks/s, output: 261.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:33<02:05, 9.61it/s, est. speed input: 1055.96 toks/s, output: 282.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:33<01:34, 12.58it/s, est. speed input: 1238.33 toks/s, output: 346.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:34<01:36, 12.28it/s, est. speed input: 1288.63 toks/s, output: 361.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:34<01:29, 13.15it/s, est. speed input: 1388.48 toks/s, output: 391.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:35<01:04, 18.05it/s, est. speed input: 1511.52 toks/s, output: 431.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:35<01:01, 18.90it/s, est. speed input: 1572.83 toks/s, output: 453.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:35<01:01, 18.55it/s, est. speed input: 1678.50 toks/s, output: 494.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:35<00:55, 20.58it/s, est. speed input: 1737.89 toks/s, output: 514.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:36<00:49, 22.81it/s, est. speed input: 1797.04 toks/s, output: 539.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:36<00:48, 23.19it/s, est. speed input: 1845.28 toks/s, output: 555.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:51, 21.76it/s, est. speed input: 1890.91 toks/s, output: 571.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:36<00:39, 28.49it/s, est. speed input: 2000.82 toks/s, output: 619.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:50, 21.92it/s, est. speed input: 2037.52 toks/s, output: 633.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:37<00:55, 20.10it/s, est. speed input: 2083.13 toks/s, output: 654.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:37<00:54, 20.29it/s, est. speed input: 2128.42 toks/s, output: 674.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:37<00:46, 23.60it/s, est. speed input: 2177.13 toks/s, output: 699.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<01:02, 17.59it/s, est. speed input: 2213.30 toks/s, output: 717.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:38<00:41, 26.02it/s, est. speed input: 2367.76 toks/s, output: 791.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:38<00:31, 34.23it/s, est. speed input: 2473.35 toks/s, output: 842.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:38<00:29, 35.68it/s, est. speed input: 2522.88 toks/s, output: 866.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:14, 69.85it/s, est. speed input: 2813.56 toks/s, output: 988.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:39<00:16, 61.11it/s, est. speed input: 2916.11 toks/s, output: 1035.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:39<00:15, 66.77it/s, est. speed input: 3022.04 toks/s, output: 1077.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:39<00:14, 71.79it/s, est. speed input: 3137.46 toks/s, output: 1127.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:39<00:11, 87.96it/s, est. speed input: 3310.35 toks/s, output: 1209.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:39<00:11, 81.81it/s, est. speed input: 3461.16 toks/s, output: 1261.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:39<00:12, 76.52it/s, est. speed input: 3560.98 toks/s, output: 1303.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:40<00:19, 49.23it/s, est. speed input: 3641.05 toks/s, output: 1339.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:40<00:18, 52.70it/s, est. speed input: 3735.80 toks/s, output: 1387.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:40<00:18, 51.73it/s, est. speed input: 3832.64 toks/s, output: 1437.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:41<00:20, 44.84it/s, est. speed input: 3912.84 toks/s, output: 1472.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:41<00:18, 49.72it/s, est. speed input: 4012.03 toks/s, output: 1526.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:41<00:15, 57.73it/s, est. speed input: 4159.58 toks/s, output: 1615.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:41<00:15, 57.14it/s, est. speed input: 4304.98 toks/s, output: 1691.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:41<00:14, 61.14it/s, est. speed input: 4404.06 toks/s, output: 1730.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:41<00:13, 64.83it/s, est. speed input: 4501.93 toks/s, output: 1781.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:42<00:11, 77.14it/s, est. speed input: 4659.07 toks/s, output: 1857.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:42<00:14, 58.66it/s, est. speed input: 4732.21 toks/s, output: 1900.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:42<00:11, 70.85it/s, est. speed input: 4951.77 toks/s, output: 2019.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:42<00:13, 61.22it/s, est. speed input: 5034.99 toks/s, output: 2060.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:42<00:14, 55.09it/s, est. speed input: 5115.83 toks/s, output: 2100.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:43<00:16, 48.77it/s, est. speed input: 5186.41 toks/s, output: 2140.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:43<00:12, 60.72it/s, est. speed input: 5329.13 toks/s, output: 2222.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:43<00:11, 68.06it/s, est. speed input: 5518.43 toks/s, output: 2307.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:43<00:10, 73.67it/s, est. speed input: 5659.97 toks/s, output: 2368.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:43<00:07, 99.21it/s, est. speed input: 5903.73 toks/s, output: 2505.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:44<00:06, 103.53it/s, est. speed input: 6037.95 toks/s, output: 2606.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:44<00:07, 95.68it/s, est. speed input: 6166.63 toks/s, output: 2709.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:44<00:06, 101.46it/s, est. speed input: 6304.09 toks/s, output: 2780.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:44<00:06, 100.72it/s, est. speed input: 6438.83 toks/s, output: 2872.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:44<00:04, 137.78it/s, est. speed input: 6739.17 toks/s, output: 3044.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:44<00:05, 113.22it/s, est. speed input: 6860.53 toks/s, output: 3123.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:45<00:05, 100.05it/s, est. speed input: 6985.89 toks/s, output: 3193.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:45<00:06, 86.65it/s, est. speed input: 7184.91 toks/s, output: 3332.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:45<00:05, 101.33it/s, est. speed input: 7380.30 toks/s, output: 3443.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:45<00:04, 115.38it/s, est. speed input: 7567.00 toks/s, output: 3560.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:45<00:05, 88.65it/s, est. speed input: 7667.13 toks/s, output: 3632.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:46<00:05, 85.47it/s, est. speed input: 7782.71 toks/s, output: 3725.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:46<00:04, 104.03it/s, est. speed input: 7974.10 toks/s, output: 3862.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:46<00:04, 108.57it/s, est. speed input: 8128.67 toks/s, output: 3943.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:46<00:03, 122.97it/s, est. speed input: 8301.80 toks/s, output: 4079.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:46<00:03, 134.93it/s, est. speed input: 8487.29 toks/s, output: 4216.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:46<00:04, 97.81it/s, est. speed input: 8587.06 toks/s, output: 4298.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:47<00:03, 101.22it/s, est. speed input: 8829.84 toks/s, output: 4470.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:47<00:03, 112.79it/s, est. speed input: 9052.55 toks/s, output: 4625.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:47<00:04, 85.86it/s, est. speed input: 9144.45 toks/s, output: 4684.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:47<00:02, 109.01it/s, est. speed input: 9375.72 toks/s, output: 4845.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:47<00:02, 140.30it/s, est. speed input: 9642.29 toks/s, output: 5073.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:48<00:02, 129.16it/s, est. speed input: 9788.77 toks/s, output: 5194.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:48<00:02, 98.16it/s, est. speed input: 9918.29 toks/s, output: 5307.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:48<00:02, 94.53it/s, est. speed input: 10020.06 toks/s, output: 5411.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:48<00:01, 111.42it/s, est. speed input: 10228.18 toks/s, output: 5570.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:48<00:01, 105.46it/s, est. speed input: 10339.29 toks/s, output: 5672.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:49<00:01, 97.73it/s, est. speed input: 10438.43 toks/s, output: 5764.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:49<00:01, 103.95it/s, est. speed input: 10562.55 toks/s, output: 5853.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:49<00:01, 101.89it/s, est. speed input: 10685.12 toks/s, output: 5978.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:49<00:01, 82.52it/s, est. speed input: 10769.21 toks/s, output: 6070.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:49<00:01, 77.78it/s, est. speed input: 10825.32 toks/s, output: 6133.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:50<00:02, 55.51it/s, est. speed input: 10842.41 toks/s, output: 6166.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:50<00:02, 48.10it/s, est. speed input: 10877.70 toks/s, output: 6208.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:50<00:02, 43.88it/s, est. speed input: 10913.36 toks/s, output: 6244.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:50<00:01, 58.05it/s, est. speed input: 11033.89 toks/s, output: 6387.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:51<00:00, 72.81it/s, est. speed input: 11208.20 toks/s, output: 6591.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:51<00:00, 71.07it/s, est. speed input: 11260.44 toks/s, output: 6666.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:51<00:00, 53.47it/s, est. speed input: 11278.83 toks/s, output: 6740.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:51<00:00, 48.12it/s, est. speed input: 11332.84 toks/s, output: 6832.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:52<00:00, 40.88it/s, est. speed input: 11348.13 toks/s, output: 6878.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:52<00:00, 43.43it/s, est. speed input: 11397.85 toks/s, output: 6974.54 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:52<00:00, 24.40it/s, est. speed input: 11397.85 toks/s, output: 6974.54 toks/s]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.65 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:41:40 [executor_base.py:219] It took 0.340483 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.57 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:43:07 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:41:40 [executor_base.py:219] It took 0.341120 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:43:07 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:43:07 [executor_base.py:208] It took 0.327360 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:43:12 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:43:12 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:43:12 [executor_base.py:208] It took 0.327882 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.09294965118169785, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.6862766742706299, 'actor/pg_clipfrac': 0.001402524532750249, 'actor/ppo_kl': 0.00045080037671141326}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3251124322414398, 'actor/pg_clipfrac': 0.004137930925935507, 'actor/ppo_kl': -0.00033887141034938395}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022721078130416572, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.06938306242227554, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.16606462001800537, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2622925341129303, 'actor/pg_clipfrac': 0.001700680237263441, 'actor/ppo_kl': 0.0006913457764312625}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.29204660654067993, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.600469708442688, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003088070952799171}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.5227500796318054, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3387373685836792, 'actor/pg_clipfrac': 0.0005717552849091589, 'actor/ppo_kl': 0.00018316217756364495}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00020172959193587303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2627282738685608, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0024747576098889112}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001880335039459169, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.0730181634426117, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.23318590223789215, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.15439192950725555, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002325486857444048}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.6561609506607056, 'actor/pg_clipfrac': 0.0006402048747986555, 'actor/ppo_kl': -0.0007467709365300834}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00019205218995921314, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003916611021850258}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.005348702426999807, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005420753732323647}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3529031276702881, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000675435469020158}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.8001721501350403, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.003515678457915783}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.000358879187842831, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007531098090112209}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1798761785030365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006589769036509097}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.249922975897789, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.148915104451589e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00018583591736387461, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002000261563807726}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.8900852203369141, 'actor/pg_clipfrac': 0.0020242915488779545, 'actor/ppo_kl': 0.0009551241528242826}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.15039144456386566, 'actor/pg_clipfrac': 0.0006451613153330982, 'actor/ppo_kl': -0.00019723462173715234}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00016080877685453743, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001541359961265698}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00012437975965440273, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006475882837548852}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3375163674354553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00029122334672138095}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2361493706703186, 'actor/pg_clipfrac': 0.000649772584438324, 'actor/ppo_kl': 0.0005285660736262798}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00024067910271696746, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009124884963966906}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00010664101137081161, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008149718050844967}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:14<1:33:33, 14.93s/it, est. speed input: 30.14 toks/s, output: 4.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<40:44, 6.52s/it, est. speed input: 58.93 toks/s, output: 10.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 4/377 [00:15<15:30, 2.49s/it, est. speed input: 117.59 toks/s, output: 22.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:15<10:59, 1.77s/it, est. speed input: 146.14 toks/s, output: 28.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 9/377 [00:16<04:11, 1.46it/s, est. speed input: 258.99 toks/s, output: 53.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:16<02:59, 2.04it/s, est. speed input: 313.78 toks/s, output: 67.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 13/377 [00:16<02:10, 2.79it/s, est. speed input: 365.76 toks/s, output: 80.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 15/377 [00:16<01:39, 3.66it/s, est. speed input: 418.93 toks/s, output: 94.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 24/377 [00:16<00:37, 9.39it/s, est. speed input: 671.29 toks/s, output: 158.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 29/377 [00:16<00:27, 12.74it/s, est. speed input: 804.17 toks/s, output: 194.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 33/377 [00:17<00:22, 15.33it/s, est. speed input: 905.33 toks/s, output: 223.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 38/377 [00:17<00:18, 18.56it/s, est. speed input: 1033.06 toks/s, output: 262.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 42/377 [00:17<00:15, 20.97it/s, est. speed input: 1134.69 toks/s, output: 292.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 48/377 [00:17<00:12, 26.55it/s, est. speed input: 1285.40 toks/s, output: 340.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 52/377 [00:17<00:11, 27.80it/s, est. speed input: 1379.57 toks/s, output: 372.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 57/377 [00:17<00:10, 30.89it/s, est. speed input: 1504.76 toks/s, output: 411.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 61/377 [00:17<00:10, 31.28it/s, est. speed input: 1597.39 toks/s, output: 444.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 65/377 [00:17<00:10, 29.76it/s, est. speed input: 1686.77 toks/s, output: 476.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 69/377 [00:18<00:11, 27.07it/s, est. speed input: 1771.32 toks/s, output: 507.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 75/377 [00:18<00:09, 32.68it/s, est. speed input: 1911.70 toks/s, output: 561.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 81/377 [00:18<00:07, 37.18it/s, est. speed input: 2050.11 toks/s, output: 614.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▎ | 89/377 [00:18<00:06, 45.36it/s, est. speed input: 2236.21 toks/s, output: 687.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 98/377 [00:18<00:05, 54.33it/s, est. speed input: 2450.70 toks/s, output: 769.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 104/377 [00:18<00:06, 44.72it/s, est. speed input: 2574.42 toks/s, output: 821.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 109/377 [00:18<00:05, 44.67it/s, est. speed input: 2683.55 toks/s, output: 867.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 114/377 [00:19<00:06, 41.95it/s, est. speed input: 2787.06 toks/s, output: 912.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 121/377 [00:19<00:05, 47.31it/s, est. speed input: 2939.52 toks/s, output: 980.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 129/377 [00:19<00:04, 52.73it/s, est. speed input: 3113.25 toks/s, output: 1059.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 138/377 [00:19<00:04, 59.35it/s, est. speed input: 3309.54 toks/s, output: 1151.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 151/377 [00:19<00:03, 74.44it/s, est. speed input: 3603.49 toks/s, output: 1287.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 159/377 [00:19<00:03, 65.41it/s, est. speed input: 3761.25 toks/s, output: 1368.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 166/377 [00:19<00:03, 61.24it/s, est. speed input: 3899.94 toks/s, output: 1440.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 173/377 [00:19<00:03, 62.15it/s, est. speed input: 4045.06 toks/s, output: 1514.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 180/377 [00:20<00:03, 62.76it/s, est. speed input: 4188.54 toks/s, output: 1589.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 187/377 [00:20<00:03, 57.71it/s, est. speed input: 4324.25 toks/s, output: 1662.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 193/377 [00:20<00:03, 58.10it/s, est. speed input: 4441.70 toks/s, output: 1728.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 199/377 [00:20<00:03, 55.62it/s, est. speed input: 4559.74 toks/s, output: 1794.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 210/377 [00:20<00:02, 63.70it/s, est. speed input: 4776.70 toks/s, output: 1922.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 224/377 [00:20<00:01, 79.59it/s, est. speed input: 5063.44 toks/s, output: 2093.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 233/377 [00:20<00:01, 72.32it/s, est. speed input: 5232.29 toks/s, output: 2197.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 241/377 [00:20<00:01, 72.37it/s, est. speed input: 5390.09 toks/s, output: 2294.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 249/377 [00:21<00:01, 66.58it/s, est. speed input: 5533.88 toks/s, output: 2390.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 258/377 [00:21<00:01, 63.59it/s, est. speed input: 5697.12 toks/s, output: 2499.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 265/377 [00:21<00:01, 59.37it/s, est. speed input: 5816.82 toks/s, output: 2584.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 272/377 [00:21<00:01, 59.50it/s, est. speed input: 5946.25 toks/s, output: 2675.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 284/377 [00:21<00:01, 72.46it/s, est. speed input: 6177.52 toks/s, output: 2842.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 293/377 [00:21<00:01, 73.18it/s, est. speed input: 6342.25 toks/s, output: 2966.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 301/377 [00:21<00:01, 74.05it/s, est. speed input: 6486.21 toks/s, output: 3078.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 309/377 [00:22<00:01, 62.80it/s, est. speed input: 6609.17 toks/s, output: 3181.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 319/377 [00:22<00:00, 66.55it/s, est. speed input: 6785.43 toks/s, output: 3326.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 326/377 [00:22<00:00, 61.41it/s, est. speed input: 6891.98 toks/s, output: 3424.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 333/377 [00:22<00:00, 55.51it/s, est. speed input: 6994.38 toks/s, output: 3521.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 339/377 [00:22<00:00, 51.17it/s, est. speed input: 7076.29 toks/s, output: 3607.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:22<00:00, 42.73it/s, est. speed input: 7136.13 toks/s, output: 3684.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:23<00:00, 29.61it/s, est. speed input: 7133.49 toks/s, output: 3728.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:23<00:00, 25.56it/s, est. speed input: 7143.43 toks/s, output: 3773.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:23<00:00, 24.33it/s, est. speed input: 7166.30 toks/s, output: 3828.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:23<00:00, 19.48it/s, est. speed input: 7143.38 toks/s, output: 3852.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:24<00:00, 15.39it/s, est. speed input: 7103.67 toks/s, output: 3869.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:24<00:00, 14.13it/s, est. speed input: 7085.99 toks/s, output: 3887.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:25<00:01, 8.23it/s, est. speed input: 6940.58 toks/s, output: 3841.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:25<00:01, 6.52it/s, est. speed input: 6837.54 toks/s, output: 3819.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:25<00:01, 5.56it/s, est. speed input: 6764.03 toks/s, output: 3799.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [00:26<00:00, 5.46it/s, est. speed input: 6729.97 toks/s, output: 3801.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:27<00:01, 2.89it/s, est. speed input: 6489.00 toks/s, output: 3690.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:48<00:14, 4.85s/it, est. speed input: 3630.87 toks/s, output: 2134.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [00:53<00:09, 4.78s/it, est. speed input: 3331.48 toks/s, output: 2031.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 376/377 [01:03<00:06, 6.12s/it, est. speed input: 2805.09 toks/s, output: 1793.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.94it/s, est. speed input: 2808.37 toks/s, output: 1877.11 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00016822037287056446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005749698611907661}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.08324199169874191, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003701562818605453}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.10525131970643997, 'actor/pg_clipfrac': 0.0006153846043162048, 'actor/ppo_kl': 0.00023220531875267625}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.05460220202803612, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 9.419097477803007e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2552737891674042, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019182151881977916}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002412160683888942, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016239503165706992}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.060096628963947296, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011982480064034462}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.0766952782869339, 'actor/pg_clipfrac': 0.0020408162381500006, 'actor/ppo_kl': -0.0005635047564283013}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3345893919467926, 'actor/pg_clipfrac': 0.004870129749178886, 'actor/ppo_kl': 0.0012631106656044722}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002887570299208164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001535853953100741}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3602662980556488, 'actor/pg_clipfrac': 0.004248088225722313, 'actor/ppo_kl': 0.0001557948999106884}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00029248170903883874, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00042792409658432007}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001578577939653769, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004883378860540688}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.014658128842711449, 'actor/pg_clipfrac': 0.0006729474989697337, 'actor/ppo_kl': 0.0009568036184646189}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.1281558722257614, 'actor/pg_clipfrac': 0.001721170381642878, 'actor/ppo_kl': -0.0005723326466977596}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.10445795208215714, 'actor/pg_clipfrac': 0.0051124743185937405, 'actor/ppo_kl': 0.000529913348145783}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.05659923702478409, 'actor/pg_clipfrac': 0.0030487803742289543, 'actor/ppo_kl': -0.0003549796820152551}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.12989142537117004, 'actor/pg_clipfrac': 0.003518648911267519, 'actor/ppo_kl': 0.0008716744487173855}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.11741706728935242, 'actor/pg_clipfrac': 0.002758620772510767, 'actor/ppo_kl': -0.0016815501730889082}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.22806741297245026, 'actor/pg_clipfrac': 0.0005344735691323876, 'actor/ppo_kl': -0.00045058486284688115}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.08003329485654831, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0030216032173484564}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.03946569934487343, 'actor/pg_clipfrac': 0.002218934940174222, 'actor/ppo_kl': -0.002011695643886924}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1305919587612152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018028907652478665}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2373298853635788, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010208766907453537}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00018946481577586383, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009881336009129882}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4009556770324707, 'actor/pg_clipfrac': 0.0023310023825615644, 'actor/ppo_kl': 0.0013536817859858274}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002480318071320653, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002692272246349603}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.050572626292705536, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.848530367482454e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.14924557507038116, 'actor/pg_clipfrac': 0.001988071482628584, 'actor/ppo_kl': -0.0004994826740585268}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00017423630924895406, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0021569104865193367}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:56:20 [executor_base.py:219] It took 0.339756 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.58 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:58:09 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:56:20 [executor_base.py:219] It took 0.340077 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:58:09 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:58:09 [executor_base.py:208] It took 0.327556 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:58:11 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:58:11 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:58:11 [executor_base.py:208] It took 0.327102 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric properties involved. Since D is the midpoint of AB and l is a vertical line passing through D, it implies that E is the reflection of A across l. This means that AE is equal to ED, and the distance from E to C is the same as the distance from A to C when considering the reflection.\n\nGiven that the perimeter of triangle ABC is 16.0 and BD = 2.0, we can deduce that AB = 4.0 (since D is the midpoint of AB). The perimeter of triangle ABC is the sum of its sides: AB + BC + CA = 16.0. Since AB = 4.0, we have BC + CA = 12.0.\n\nTo minimize the perimeter of triangle AEC, we need to consider the reflection of A across l. The minimum perimeter of triangle AEC is achieved when E is the point on l such that AE + EC is minimized. Since E is the reflection of A across l, AE = ED and the distance from E to C is the same as the distance from A to C.\n\nThe minimum perimeter of triangle AEC is the same as the length of AC because AE + EC = AC when E is the reflection of A across l. Since AB + BC + CA = 16.0 and AB = 4.0, we have BC + CA = 12.0. The minimum perimeter of triangle AEC is therefore the length of AC, which is half of BC + CA because AC is the base when E is the reflection of A across l.\n\nSince AB = 4.0, BC + CA = 12.0, and the reflection of A across l makes AE = ED and EC = CA, the minimum perimeter of triangle AEC is the length of AC, which is half of BC + CA. Therefore, the minimum perimeter of triangle AEC is 6.0.\n\n\nThe minimum perimeter of triangle AEC is 6.0.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated 90 degrees clockwise around point A to form triangle AB'C', we know that angle BAC = 90 degrees. Since the rotation is 90 degrees, angle B'AC' = 90 degrees as well. Because the rotation is around point A, AB = AB' and AC = AC'. This means that triangle AB'C' is congruent to triangle ABC by the Side-Angle-Side (SAS) criterion.\n\nSince angle CC'B' = 32 degrees, and angle C'AC = 90 degrees (because of the rotation), we can find angle AC'B' by subtracting the given angle from 90 degrees. This is because angle AC'B' is the remaining angle in the right triangle formed by the rotation, which is 90 degrees minus the given angle CC'B'.\n\n\nThe size of angle AC'B' is 58 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that ABCD is a parallelogram, and A, B, and D are points on circle O, while C is on the diameter BE. Since BE is a diameter, angle BAE is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. \n\nGiven that angle ADC = 54 degrees, and since ABCD is a parallelogram, angle ABC = angle ADC = 54 degrees. \n\nNow, in triangle ABE, angle ABE is a right angle (90 degrees) since BE is the diameter. \n\nSo, angle AEB can be found by subtracting angle ABE and angle BAE from 180 degrees (since the sum of angles in a triangle is 180 degrees). \n\nTherefore, angle AEB = 180 degrees - 90 degrees - 54 degrees = 36 degrees.\n\n\nThe degree of angle AEB is 36 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find angle C, we need to use properties of circles and tangents. Since AB is the diameter of circle O and CD is tangent to the circle at point D, angle ADB is a right angle (90 degrees) because the angle between a tangent and a radius at the point of tangency is always 90 degrees.\n\nGiven that angle A is 35 degrees, and since angle ADB is a right angle, we can use the fact that the sum of angles in a triangle is 180 degrees. Therefore, angle DBC can be found by subtracting angle A from angle ADB:\n\nangle DBC = 90 - angle A = 90 - 35 = 55 degrees\n\nSince angle C is an inscribed angle that intercepts the same arc as angle DBC, it is half of angle DBC:\n\nangle C = 1/2 * angle DBC = 1/2 * 55 = 27.5 degrees\n\nTherefore, angle C = 27.5 degrees.\n\n\n27.5\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:26<1:51:45, 5.26s/it, est. speed input: 81.00 toks/s, output: 24.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:27<49:11, 2.32s/it, est. speed input: 160.24 toks/s, output: 45.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<27:21, 1.30s/it, est. speed input: 239.93 toks/s, output: 66.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<22:15, 1.06s/it, est. speed input: 285.23 toks/s, output: 76.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:32<16:10, 1.29it/s, est. speed input: 346.35 toks/s, output: 94.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<11:11, 1.86it/s, est. speed input: 411.99 toks/s, output: 116.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:35<07:56, 2.60it/s, est. speed input: 515.02 toks/s, output: 151.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<04:49, 4.25it/s, est. speed input: 643.43 toks/s, output: 195.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:07, 6.51it/s, est. speed input: 768.34 toks/s, output: 238.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:23, 8.44it/s, est. speed input: 882.39 toks/s, output: 270.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<01:41, 11.88it/s, est. speed input: 1009.32 toks/s, output: 316.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:54, 10.45it/s, est. speed input: 1052.82 toks/s, output: 330.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:43, 11.46it/s, est. speed input: 1107.35 toks/s, output: 349.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:13, 16.10it/s, est. speed input: 1229.87 toks/s, output: 398.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:58, 19.95it/s, est. speed input: 1342.91 toks/s, output: 437.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:38<00:51, 22.55it/s, est. speed input: 1507.05 toks/s, output: 498.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:50, 22.88it/s, est. speed input: 1554.98 toks/s, output: 521.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:38, 29.56it/s, est. speed input: 1658.51 toks/s, output: 569.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:39<00:30, 36.61it/s, est. speed input: 1771.99 toks/s, output: 614.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:25, 43.64it/s, est. speed input: 1884.89 toks/s, output: 665.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:24, 45.61it/s, est. speed input: 1993.24 toks/s, output: 709.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:23, 46.36it/s, est. speed input: 2208.74 toks/s, output: 802.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:22, 48.44it/s, est. speed input: 2308.74 toks/s, output: 840.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:24, 43.37it/s, est. speed input: 2407.32 toks/s, output: 879.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:21, 50.01it/s, est. speed input: 2517.91 toks/s, output: 933.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:28, 36.47it/s, est. speed input: 2607.35 toks/s, output: 974.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:20, 50.10it/s, est. speed input: 2761.50 toks/s, output: 1051.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:18, 56.41it/s, est. speed input: 2868.36 toks/s, output: 1099.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:16, 62.62it/s, est. speed input: 2973.66 toks/s, output: 1149.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:20, 47.94it/s, est. speed input: 3061.69 toks/s, output: 1189.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:15, 63.62it/s, est. speed input: 3220.59 toks/s, output: 1277.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:42<00:22, 44.32it/s, est. speed input: 3299.80 toks/s, output: 1313.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:19, 48.56it/s, est. speed input: 3412.93 toks/s, output: 1365.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:13, 72.36it/s, est. speed input: 3616.97 toks/s, output: 1464.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:15, 58.89it/s, est. speed input: 3750.23 toks/s, output: 1530.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:42<00:15, 60.35it/s, est. speed input: 3848.12 toks/s, output: 1584.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:42<00:14, 62.01it/s, est. speed input: 3942.81 toks/s, output: 1632.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:43<00:18, 48.06it/s, est. speed input: 4015.34 toks/s, output: 1667.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:15, 55.88it/s, est. speed input: 4157.76 toks/s, output: 1752.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:16, 52.84it/s, est. speed input: 4238.66 toks/s, output: 1793.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:13, 65.26it/s, est. speed input: 4391.80 toks/s, output: 1854.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:12, 66.81it/s, est. speed input: 4520.77 toks/s, output: 1929.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:44<00:17, 48.79it/s, est. speed input: 4579.95 toks/s, output: 1974.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:44<00:15, 54.25it/s, est. speed input: 4679.51 toks/s, output: 2027.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:44<00:16, 49.57it/s, est. speed input: 4754.42 toks/s, output: 2054.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:10, 74.25it/s, est. speed input: 4995.99 toks/s, output: 2179.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:10, 76.09it/s, est. speed input: 5080.57 toks/s, output: 2224.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:10, 72.43it/s, est. speed input: 5166.61 toks/s, output: 2264.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:10, 75.44it/s, est. speed input: 5256.13 toks/s, output: 2312.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:13, 57.85it/s, est. speed input: 5321.19 toks/s, output: 2355.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:10, 73.99it/s, est. speed input: 5466.21 toks/s, output: 2434.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:45<00:11, 65.50it/s, est. speed input: 5545.44 toks/s, output: 2492.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:46<00:10, 68.97it/s, est. speed input: 5675.47 toks/s, output: 2562.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:46<00:11, 60.64it/s, est. speed input: 5744.09 toks/s, output: 2616.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:10, 66.27it/s, est. speed input: 5877.33 toks/s, output: 2696.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:10, 65.87it/s, est. speed input: 5960.36 toks/s, output: 2747.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:46<00:07, 87.66it/s, est. speed input: 6142.10 toks/s, output: 2866.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:06, 96.46it/s, est. speed input: 6279.13 toks/s, output: 2951.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:47<00:06, 103.82it/s, est. speed input: 6411.36 toks/s, output: 3047.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:47<00:05, 110.12it/s, est. speed input: 6546.94 toks/s, output: 3141.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:04, 130.32it/s, est. speed input: 6778.87 toks/s, output: 3296.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:47<00:04, 134.16it/s, est. speed input: 6912.61 toks/s, output: 3386.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:47<00:04, 117.56it/s, est. speed input: 7035.59 toks/s, output: 3466.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:47<00:05, 103.59it/s, est. speed input: 7157.53 toks/s, output: 3532.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:47<00:03, 144.55it/s, est. speed input: 7471.70 toks/s, output: 3708.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:03, 122.57it/s, est. speed input: 7632.95 toks/s, output: 3842.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:03, 120.76it/s, est. speed input: 7758.87 toks/s, output: 3935.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:48<00:04, 110.84it/s, est. speed input: 7874.77 toks/s, output: 4026.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:48<00:04, 91.07it/s, est. speed input: 7975.69 toks/s, output: 4102.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:48<00:04, 86.50it/s, est. speed input: 8090.26 toks/s, output: 4173.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:05, 78.91it/s, est. speed input: 8185.65 toks/s, output: 4266.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:05, 79.01it/s, est. speed input: 8294.47 toks/s, output: 4354.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 95.02it/s, est. speed input: 8454.14 toks/s, output: 4460.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 101.86it/s, est. speed input: 8565.94 toks/s, output: 4545.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:49<00:03, 112.52it/s, est. speed input: 8725.27 toks/s, output: 4671.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:49<00:02, 125.41it/s, est. speed input: 8890.14 toks/s, output: 4798.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:50<00:02, 103.37it/s, est. speed input: 8986.29 toks/s, output: 4878.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:50<00:02, 106.95it/s, est. speed input: 9102.40 toks/s, output: 4974.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:50<00:03, 79.48it/s, est. speed input: 9185.48 toks/s, output: 5032.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:50<00:01, 130.32it/s, est. speed input: 9534.39 toks/s, output: 5324.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:50<00:01, 124.21it/s, est. speed input: 9687.70 toks/s, output: 5443.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:51<00:01, 112.71it/s, est. speed input: 9793.88 toks/s, output: 5545.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:51<00:02, 92.91it/s, est. speed input: 9878.72 toks/s, output: 5626.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:01, 91.16it/s, est. speed input: 9987.79 toks/s, output: 5733.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:51<00:01, 89.08it/s, est. speed input: 10088.70 toks/s, output: 5817.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:02, 68.65it/s, est. speed input: 10125.85 toks/s, output: 5863.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:52<00:01, 74.82it/s, est. speed input: 10232.91 toks/s, output: 5980.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:52<00:01, 94.93it/s, est. speed input: 10418.01 toks/s, output: 6178.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:00, 102.57it/s, est. speed input: 10529.74 toks/s, output: 6287.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:00, 99.11it/s, est. speed input: 10634.97 toks/s, output: 6411.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:53<00:01, 54.28it/s, est. speed input: 10642.95 toks/s, output: 6482.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:53<00:00, 69.44it/s, est. speed input: 10793.07 toks/s, output: 6663.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:53<00:00, 53.51it/s, est. speed input: 10827.51 toks/s, output: 6753.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 27.15it/s, est. speed input: 10703.17 toks/s, output: 6724.90 toks/s]
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to understand the properties of a circle and the angles formed by the tangents and chords.\n\n1. Since AD is the diameter of circle O, angle ABD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n2. EA is the tangent to the circle at point A, which means that the radius OA is perpendicular to the tangent EA at the point of tangency A. Therefore, angle OAE is a right angle (90 degrees).\n3. Since angle EAC is given as 120 degrees, and we know that angle OAE is 90 degrees, we can find angle OAD by subtracting 90 degrees from 120 degrees, which gives us 30 degrees.\n4. Since AD is the diameter, angle ACD is a right angle (90 degrees) because the angle subtended by a diameter in a semicircle is a right angle.\n5. Now, we need to find angle ABC. Since angle ABD is 90 degrees and angle ACD is 90 degrees, the sum of these two angles is 180 degrees. Therefore, angle ABC must be the remaining angle in the triangle ABD, which is 90 degrees.\n\n\nThe degree of angle ABC is 30 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_15
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 30
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.271
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.005
[36m(Runner pid=3309020)[0m ppo_kl: -8.497871398169821e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.011
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.011
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.643
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.643
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 953729
[36m(Runner pid=3309020)[0m balanced_min: 953383
[36m(Runner pid=3309020)[0m max: 963959
[36m(Runner pid=3309020)[0m mean: 953556.0
[36m(Runner pid=3309020)[0m min: 943153
[36m(Runner pid=3309020)[0m minmax_diff: 20806
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 108.29
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.117
[36m(Runner pid=3309020)[0m throughput: 913.779
[36m(Runner pid=3309020)[0m time_per_step: 1043.531
[36m(Runner pid=3309020)[0m total_num_tokens: 1907112
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 693.0
[36m(Runner pid=3309020)[0m mean: 464.383
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1955.0
[36m(Runner pid=3309020)[0m mean: 280.583
[36m(Runner pid=3309020)[0m min: 51.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.288
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.643
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.65144061158411e-05
[36m(Runner pid=3309020)[0m gen: 0.148
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.296
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.146
[36m(Runner pid=3309020)[0m gen: 106.604
[36m(Runner pid=3309020)[0m old: 83.681
[36m(Runner pid=3309020)[0m ref: 85.757
[36m(Runner pid=3309020)[0m reward: 6.116
[36m(Runner pid=3309020)[0m save_checkpoint: 33.28
[36m(Runner pid=3309020)[0m step: 1043.531
[36m(Runner pid=3309020)[0m update_actor: 564.92
[36m(Runner pid=3309020)[0m validation: 162.309
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.33
[36m(Runner pid=3309020)[0m format_reward: 0.991
[36m(Runner pid=3309020)[0m overall_reward: 0.661
[36m(Runner pid=3309020)[0m reward_score: 0.661
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.993
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m Training Episode 2.
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 31; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 04:59:32 [executor_base.py:219] It took 0.344764 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 24.66it/s, est. speed input: 10682.63 toks/s, output: 6743.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 22.87it/s, est. speed input: 10666.14 toks/s, output: 6765.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 16.49it/s, est. speed input: 10570.62 toks/s, output: 6731.87 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.72it/s, est. speed input: 10570.62 toks/s, output: 6731.87 toks/s]
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:01:01 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 04:59:32 [executor_base.py:219] It took 0.341550 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:01:01 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:01:01 [executor_base.py:208] It took 0.327793 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:01:03 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:01:03 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:01:03 [executor_base.py:208] It took 0.326038 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00019455395522527397, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007194890640676022}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.39342474937438965, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.380597859621048, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.20906367897987366, 'actor/pg_clipfrac': 0.0015723269898444414, 'actor/ppo_kl': -0.0007789119845256209}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.294100284576416, 'actor/pg_clipfrac': 0.004040404222905636, 'actor/ppo_kl': -0.0016641173278912902}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.20640641450881958, 'actor/pg_clipfrac': 0.00453172205016017, 'actor/ppo_kl': 0.0008556994143873453}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.23098692297935486, 'actor/pg_clipfrac': 0.00296296295709908, 'actor/ppo_kl': 0.002465167548507452}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.09248083829879761, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.49291786551475525, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.06864633411169052, 'actor/pg_clipfrac': 0.0034168565180152655, 'actor/ppo_kl': -0.0018915054388344288}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0001493646705057472, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.058588314801454544, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001266114879399538}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.05965037643909454, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3298785090446472, 'actor/pg_clipfrac': 0.0020222447346895933, 'actor/ppo_kl': -0.0008863481925800443}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00016812614921946079, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.487515389919281, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.102363396261353e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001513238385086879, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016642623813822865}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.30979305505752563, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004185859870631248}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001988119911402464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0027059840504080057}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.25724315643310547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007569681038148701}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00014844229735899717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005041025578975677}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.06081486493349075, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012098555453121662}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.14431776106357574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016587020363658667}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0873105451464653, 'actor/pg_clipfrac': 0.0006090134265832603, 'actor/ppo_kl': -0.0003025743644684553}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00026969643658958375, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007541980012319982}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.10993833839893341, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000634560885373503}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.07745503634214401, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005947251338511705}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.21574735641479492, 'actor/pg_clipfrac': 0.0006038647261448205, 'actor/ppo_kl': -0.0016430207761004567}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.055808670818805695, 'actor/pg_clipfrac': 0.0009149130783043802, 'actor/ppo_kl': 0.0007491377764381468}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.15681001543998718, 'actor/pg_clipfrac': 0.0006939625018276274, 'actor/ppo_kl': 0.00016881160263437778}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001760150771588087, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017125696176663041}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.20881657302379608, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022546127729583532}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00013063866936136037, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004942708183079958}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.11080227792263031, 'actor/pg_clipfrac': 0.0011123470030725002, 'actor/ppo_kl': 0.0005679740570485592}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.19496528804302216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007071273867040873}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002004761918215081, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001547197112813592}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00014995313540566713, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00102525413967669}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.054058466106653214, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004337248974479735}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00015808948955964297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010257893009111285}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.16088873147964478, 'actor/pg_clipfrac': 0.0008097165846265852, 'actor/ppo_kl': -0.0004871739074587822}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1484280675649643, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000967330124694854}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.12398282438516617, 'actor/pg_clipfrac': 0.0010141987586393952, 'actor/ppo_kl': -8.821584196994081e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.37321555614471436, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0025551430881023407}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5174919366836548, 'actor/pg_clipfrac': 0.0025575447361916304, 'actor/ppo_kl': 0.0015704975230619311}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3827197253704071, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000880848616361618}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.03936187922954559, 'actor/pg_clipfrac': 0.0027649770490825176, 'actor/ppo_kl': -0.002232541795819998}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3767082393169403, 'actor/pg_clipfrac': 0.0011709601385518909, 'actor/ppo_kl': 0.0011034547351300716}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.6904870867729187, 'actor/pg_clipfrac': 0.002288329415023327, 'actor/ppo_kl': -0.00042075151577591896}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.4475991427898407, 'actor/pg_clipfrac': 0.0019588638097047806, 'actor/ppo_kl': 0.0002631506067700684}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.47885242104530334, 'actor/pg_clipfrac': 0.0010542962700128555, 'actor/ppo_kl': 0.0016632270999252796}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00015451735816895962, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002199914597440511}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.08316149562597275, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008534647058695555}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.8147887587547302, 'actor/pg_clipfrac': 0.0030959751456975937, 'actor/ppo_kl': 0.0010949878487735987}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001669877237873152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00021743203978985548}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.021589845418930054, 'actor/pg_clipfrac': 0.0023510970640927553, 'actor/ppo_kl': 0.0006245266413316131}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.16721795499324799, 'actor/pg_clipfrac': 0.000834724516607821, 'actor/ppo_kl': -0.0012521600583568215}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.000194466148968786, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00036975540569983423}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.18872325122356415, 'actor/pg_clipfrac': 0.001114827231504023, 'actor/ppo_kl': -0.002097358461469412}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00021504081087186933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015516628045588732}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2007092535495758, 'actor/pg_clipfrac': 0.002772643230855465, 'actor/ppo_kl': 0.0007782881730236113}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1529465913772583, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -7.47953963582404e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001721422013361007, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000558297848328948}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.18074052035808563, 'actor/pg_clipfrac': 0.0012690355069935322, 'actor/ppo_kl': 0.000917347555514425}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00014333210128825158, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013770181685686111}
[36m(Runner pid=3309020)[0m Step 31
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.234
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.014
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.014
[36m(Runner pid=3309020)[0m ppo_kl: 3.169355595744605e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.63
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 967735
[36m(Runner pid=3309020)[0m balanced_min: 967734
[36m(Runner pid=3309020)[0m max: 974535
[36m(Runner pid=3309020)[0m mean: 967734.5
[36m(Runner pid=3309020)[0m min: 960934
[36m(Runner pid=3309020)[0m minmax_diff: 13601
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.852
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1144.208
[36m(Runner pid=3309020)[0m time_per_step: 845.768
[36m(Runner pid=3309020)[0m total_num_tokens: 1935469
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 693.0
[36m(Runner pid=3309020)[0m mean: 466.137
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1382.0
[36m(Runner pid=3309020)[0m mean: 289.906
[36m(Runner pid=3309020)[0m min: 59.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.264
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.63
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.555354645206097e-05
[36m(Runner pid=3309020)[0m gen: 0.143
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.146
[36m(Runner pid=3309020)[0m gen: 106.092
[36m(Runner pid=3309020)[0m old: 85.726
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:13:10, 3.44s/it, est. speed input: 131.83 toks/s, output: 24.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<50:17, 2.38s/it, est. speed input: 181.18 toks/s, output: 39.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<27:56, 1.33s/it, est. speed input: 269.23 toks/s, output: 60.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<19:34, 1.07it/s, est. speed input: 336.70 toks/s, output: 82.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:03, 1.49it/s, est. speed input: 403.01 toks/s, output: 104.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<10:25, 2.00it/s, est. speed input: 471.40 toks/s, output: 124.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<07:12, 2.88it/s, est. speed input: 547.73 toks/s, output: 143.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:29<05:27, 3.78it/s, est. speed input: 617.14 toks/s, output: 162.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<05:08, 3.98it/s, est. speed input: 719.13 toks/s, output: 195.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:33<04:42, 4.34it/s, est. speed input: 771.04 toks/s, output: 211.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<03:46, 5.39it/s, est. speed input: 830.24 toks/s, output: 230.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:33<02:53, 6.99it/s, est. speed input: 889.86 toks/s, output: 247.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:33<02:19, 8.65it/s, est. speed input: 950.14 toks/s, output: 264.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:34<01:45, 11.39it/s, est. speed input: 1064.04 toks/s, output: 305.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:54, 10.47it/s, est. speed input: 1112.96 toks/s, output: 323.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:33, 12.72it/s, est. speed input: 1176.01 toks/s, output: 341.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:07, 17.44it/s, est. speed input: 1293.91 toks/s, output: 379.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<01:25, 13.67it/s, est. speed input: 1334.32 toks/s, output: 396.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<01:03, 18.26it/s, est. speed input: 1454.87 toks/s, output: 443.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<00:45, 25.15it/s, est. speed input: 1569.65 toks/s, output: 486.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:36<00:35, 32.48it/s, est. speed input: 1691.62 toks/s, output: 528.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:24, 46.33it/s, est. speed input: 1877.78 toks/s, output: 601.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:36<00:21, 52.08it/s, est. speed input: 1992.95 toks/s, output: 650.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:37<00:29, 37.60it/s, est. speed input: 2088.74 toks/s, output: 694.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:37<00:34, 32.24it/s, est. speed input: 2182.77 toks/s, output: 738.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:37<00:32, 33.68it/s, est. speed input: 2237.61 toks/s, output: 762.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:37<00:27, 38.95it/s, est. speed input: 2350.23 toks/s, output: 813.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:38<00:21, 50.42it/s, est. speed input: 2533.16 toks/s, output: 884.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:38<00:23, 44.70it/s, est. speed input: 2632.15 toks/s, output: 930.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:38<00:25, 41.41it/s, est. speed input: 2730.87 toks/s, output: 961.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:38<00:18, 56.24it/s, est. speed input: 2899.83 toks/s, output: 1031.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:38<00:16, 62.27it/s, est. speed input: 3011.81 toks/s, output: 1078.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:39<00:14, 68.10it/s, est. speed input: 3126.93 toks/s, output: 1129.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:39<00:13, 73.14it/s, est. speed input: 3235.37 toks/s, output: 1175.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:39<00:15, 63.44it/s, est. speed input: 3332.67 toks/s, output: 1227.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:39<00:16, 58.24it/s, est. speed input: 3432.78 toks/s, output: 1278.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:39<00:22, 44.25it/s, est. speed input: 3524.30 toks/s, output: 1322.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:40<00:19, 48.99it/s, est. speed input: 3622.91 toks/s, output: 1368.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:40<00:20, 45.92it/s, est. speed input: 3717.42 toks/s, output: 1410.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:40<00:18, 50.63it/s, est. speed input: 3818.11 toks/s, output: 1460.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:40<00:14, 62.95it/s, est. speed input: 3979.51 toks/s, output: 1525.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:40<00:15, 59.32it/s, est. speed input: 4072.78 toks/s, output: 1558.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:41<00:18, 49.50it/s, est. speed input: 4154.44 toks/s, output: 1602.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:41<00:22, 40.48it/s, est. speed input: 4231.27 toks/s, output: 1645.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:41<00:13, 67.16it/s, est. speed input: 4497.68 toks/s, output: 1766.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:41<00:12, 68.34it/s, est. speed input: 4645.91 toks/s, output: 1830.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:41<00:13, 62.66it/s, est. speed input: 4729.48 toks/s, output: 1867.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:42<00:14, 58.64it/s, est. speed input: 4816.01 toks/s, output: 1921.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:42<00:11, 74.50it/s, est. speed input: 5013.58 toks/s, output: 2007.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:42<00:10, 76.30it/s, est. speed input: 5106.27 toks/s, output: 2059.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:42<00:10, 77.90it/s, est. speed input: 5207.85 toks/s, output: 2114.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:42<00:09, 79.87it/s, est. speed input: 5301.28 toks/s, output: 2159.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:42<00:10, 75.28it/s, est. speed input: 5394.96 toks/s, output: 2214.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:43<00:07, 96.12it/s, est. speed input: 5646.99 toks/s, output: 2347.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:43<00:07, 96.31it/s, est. speed input: 5839.93 toks/s, output: 2453.40 toks/s]
Processed prompts: 45%|████▍ | 570/1280 [00:43<00:05, 126.23it/s, est. speed input: 6090.93 toks/s, output: 2585.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:43<00:08, 83.27it/s, est. speed input: 6202.03 toks/s, output: 2648.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:43<00:08, 78.13it/s, est. speed input: 6332.62 toks/s, output: 2729.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:44<00:07, 94.13it/s, est. speed input: 6523.71 toks/s, output: 2848.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:44<00:08, 79.20it/s, est. speed input: 6642.27 toks/s, output: 2926.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:44<00:05, 107.02it/s, est. speed input: 6937.58 toks/s, output: 3082.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:44<00:04, 125.02it/s, est. speed input: 7208.80 toks/s, output: 3231.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:44<00:04, 122.96it/s, est. speed input: 7392.42 toks/s, output: 3333.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:44<00:04, 123.06it/s, est. speed input: 7525.57 toks/s, output: 3413.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:45<00:04, 127.51it/s, est. speed input: 7708.26 toks/s, output: 3513.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:45<00:03, 137.82it/s, est. speed input: 7894.76 toks/s, output: 3639.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:45<00:03, 140.17it/s, est. speed input: 8072.19 toks/s, output: 3757.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:45<00:03, 139.43it/s, est. speed input: 8210.45 toks/s, output: 3849.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:45<00:04, 96.81it/s, est. speed input: 8320.58 toks/s, output: 3931.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:45<00:05, 88.92it/s, est. speed input: 8443.71 toks/s, output: 4019.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:46<00:03, 109.29it/s, est. speed input: 8640.83 toks/s, output: 4151.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:46<00:03, 117.70it/s, est. speed input: 8767.37 toks/s, output: 4242.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:46<00:02, 153.13it/s, est. speed input: 9042.65 toks/s, output: 4426.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:46<00:02, 150.38it/s, est. speed input: 9256.79 toks/s, output: 4584.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:46<00:02, 139.54it/s, est. speed input: 9423.99 toks/s, output: 4694.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:46<00:02, 145.83it/s, est. speed input: 9603.63 toks/s, output: 4818.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:46<00:01, 155.33it/s, est. speed input: 9786.85 toks/s, output: 4952.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:47<00:02, 126.05it/s, est. speed input: 9977.73 toks/s, output: 5082.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:47<00:02, 124.82it/s, est. speed input: 10106.83 toks/s, output: 5173.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:47<00:02, 86.61it/s, est. speed input: 10181.98 toks/s, output: 5246.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:47<00:02, 87.01it/s, est. speed input: 10291.84 toks/s, output: 5343.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:47<00:02, 83.80it/s, est. speed input: 10399.93 toks/s, output: 5444.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:48<00:02, 91.91it/s, est. speed input: 10551.86 toks/s, output: 5568.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:48<00:01, 98.88it/s, est. speed input: 10671.60 toks/s, output: 5664.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:48<00:01, 89.36it/s, est. speed input: 10764.25 toks/s, output: 5732.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:48<00:01, 99.93it/s, est. speed input: 10913.89 toks/s, output: 5910.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:48<00:01, 96.14it/s, est. speed input: 11018.66 toks/s, output: 6039.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:49<00:01, 86.71it/s, est. speed input: 11104.84 toks/s, output: 6138.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:49<00:01, 83.68it/s, est. speed input: 11168.18 toks/s, output: 6213.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:49<00:01, 55.70it/s, est. speed input: 11173.40 toks/s, output: 6262.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:49<00:00, 76.43it/s, est. speed input: 11365.62 toks/s, output: 6465.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:50<00:01, 41.03it/s, est. speed input: 11301.76 toks/s, output: 6480.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:50<00:01, 41.84it/s, est. speed input: 11351.49 toks/s, output: 6559.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:51<00:00, 40.36it/s, est. speed input: 11397.85 toks/s, output: 6630.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:51<00:00, 37.22it/s, est. speed input: 11416.59 toks/s, output: 6677.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:51<00:00, 38.51it/s, est. speed input: 11442.56 toks/s, output: 6709.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:51<00:00, 38.84it/s, est. speed input: 11460.62 toks/s, output: 6750.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:53<00:00, 11.35it/s, est. speed input: 11134.50 toks/s, output: 6579.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 11.78it/s, est. speed input: 11105.84 toks/s, output: 6608.65 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 23.86it/s, est. speed input: 11105.84 toks/s, output: 6608.65 toks/s]
[36m(Runner pid=3309020)[0m ref: 86.005
[36m(Runner pid=3309020)[0m reward: 7.504
[36m(Runner pid=3309020)[0m step: 845.768
[36m(Runner pid=3309020)[0m update_actor: 559.595
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 32; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:13:38 [executor_base.py:219] It took 0.338978 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.65 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:13:38 [executor_base.py:219] It took 0.339492 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:15:03 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:15:04 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:15:04 [executor_base.py:208] It took 0.325305 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:15:08 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:15:08 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:15:08 [executor_base.py:208] It took 0.329779 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001284247264266014, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.9662104023154825e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00021369967726059258, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00016626797150820494, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.36552897095680237, 'actor/pg_clipfrac': 0.0016652789199724793, 'actor/ppo_kl': -0.00065138895297423}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.18764428794384003, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00023386820976156741, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018774338532239199}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.10639181733131409, 'actor/pg_clipfrac': 0.0010752688394859433, 'actor/ppo_kl': -0.00027241860516369343}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.13042299449443817, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.047516945749521255, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.22644926607608795, 'actor/pg_clipfrac': 0.0007616146467626095, 'actor/ppo_kl': 0.0011320949997752905}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.106253981590271, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010324218310415745}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1972026526927948, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.13014373183250427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00014752881543245167, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.03461039811372757, 'actor/pg_clipfrac': 0.0007037297473289073, 'actor/ppo_kl': 0.0003235445765312761}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.11407215893268585, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.10897713899612427, 'actor/pg_clipfrac': 0.0007923930534161627, 'actor/ppo_kl': -6.296970968833193e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.35173115134239197, 'actor/pg_clipfrac': 0.001450326293706894, 'actor/ppo_kl': 0.0007444288930855691}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.155294269323349, 'actor/pg_clipfrac': 0.0016528925625607371, 'actor/ppo_kl': 0.0007210109033621848}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5215609073638916, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011931398184970021}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.22358223795890808, 'actor/pg_clipfrac': 0.0005577245028689504, 'actor/ppo_kl': -0.0002678213350009173}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.10208424180746078, 'actor/pg_clipfrac': 0.0014556040987372398, 'actor/ppo_kl': -0.0008813894237391651}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3574083745479584, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008444045670330524}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.30542755126953125, 'actor/pg_clipfrac': 0.0020242915488779545, 'actor/ppo_kl': -6.564908835571259e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.08826509118080139, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001385434065014124}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.31701645255088806, 'actor/pg_clipfrac': 0.0015698587521910667, 'actor/ppo_kl': -0.0005255030118860304}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.19328239560127258, 'actor/pg_clipfrac': 0.0010976948542520404, 'actor/ppo_kl': 0.00032631782232783735}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.09977465122938156, 'actor/pg_clipfrac': 0.0015360983088612556, 'actor/ppo_kl': -0.00037282073753885925}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.026854943484067917, 'actor/pg_clipfrac': 0.0015037594130262733, 'actor/ppo_kl': 0.0010589513694867492}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00019740483548957855, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001318856026045978}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2664013206958771, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007013892754912376}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.000285036105196923, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037994474405422807}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.1570201814174652, 'actor/pg_clipfrac': 0.003552397945895791, 'actor/ppo_kl': 0.0007539379876106977}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.418105810880661, 'actor/pg_clipfrac': 0.003335556946694851, 'actor/ppo_kl': 0.0005295558366924524}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003404223534744233, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.090062793693505e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.7261368036270142, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005683919880539179}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.44795647263526917, 'actor/pg_clipfrac': 0.002290076343342662, 'actor/ppo_kl': -0.0014892083127051592}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003176546306349337, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0029948751907795668}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4093250334262848, 'actor/pg_clipfrac': 0.00162601622287184, 'actor/ppo_kl': 0.0025722626596689224}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3144915997982025, 'actor/pg_clipfrac': 0.0006142506026662886, 'actor/ppo_kl': -0.0005908592138439417}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.16870011389255524, 'actor/pg_clipfrac': 0.002583979396149516, 'actor/ppo_kl': 0.0006089190137572587}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0001615332585060969, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00040661031380295753}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.31090062856674194, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005578677519224584}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.6588366627693176, 'actor/pg_clipfrac': 0.002989536616951227, 'actor/ppo_kl': -0.002869764342904091}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.25193914771080017, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0020037514623254538}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00020873693574685603, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005896819639019668}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00018718320643529296, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037956490996293724}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00019840101595036685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003474444674793631}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.24563829600811005, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003329831233713776}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00014938008098397404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007579647935926914}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.32911065220832825, 'actor/pg_clipfrac': 0.0010319917928427458, 'actor/ppo_kl': 0.0005248102243058383}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.6024962067604065, 'actor/pg_clipfrac': 0.00136892544105649, 'actor/ppo_kl': 0.0008720525656826794}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3187809884548187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011916636722162366}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3752010762691498, 'actor/pg_clipfrac': 0.0031023784540593624, 'actor/ppo_kl': -0.0016775989206507802}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.17527414858341217, 'actor/pg_clipfrac': 0.0007716049440205097, 'actor/ppo_kl': -0.0002604310866445303}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.07474752515554428, 'actor/pg_clipfrac': 0.002912621246650815, 'actor/ppo_kl': 0.0021773329935967922}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.7229673266410828, 'actor/pg_clipfrac': 0.001752848387695849, 'actor/ppo_kl': 0.0005860173841938376}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.24963577091693878, 'actor/pg_clipfrac': 0.0033783784601837397, 'actor/ppo_kl': 0.0018181779887527227}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00018216457101516426, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.093086944427341e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.44771963357925415, 'actor/pg_clipfrac': 0.0016420361353084445, 'actor/ppo_kl': -0.0001848937972681597}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.25737255811691284, 'actor/pg_clipfrac': 0.0007733951788395643, 'actor/ppo_kl': -0.00131011544726789}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.33397048711776733, 'actor/pg_clipfrac': 0.0017108640167862177, 'actor/ppo_kl': 0.0011327089741826057}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.5066841244697571, 'actor/pg_clipfrac': 0.003754693316295743, 'actor/ppo_kl': -0.0006906130001880229}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0970613956451416, 'actor/pg_clipfrac': 0.000939849647693336, 'actor/ppo_kl': 0.0007413204293698072}
[36m(Runner pid=3309020)[0m Step 32
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.259
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.008
[36m(Runner pid=3309020)[0m ppo_kl: 4.6952785585219205e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.64
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.64
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 960307
[36m(Runner pid=3309020)[0m balanced_min: 960307
[36m(Runner pid=3309020)[0m max: 970197
[36m(Runner pid=3309020)[0m mean: 960307.0
[36m(Runner pid=3309020)[0m min: 950417
[36m(Runner pid=3309020)[0m minmax_diff: 19780
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.742
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.118
[36m(Runner pid=3309020)[0m throughput: 1133.998
[36m(Runner pid=3309020)[0m time_per_step: 846.833
[36m(Runner pid=3309020)[0m total_num_tokens: 1920614
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:35:26, 4.49s/it, est. speed input: 99.08 toks/s, output: 21.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:27<51:10, 2.42s/it, est. speed input: 168.95 toks/s, output: 43.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<21:51, 1.04s/it, est. speed input: 312.80 toks/s, output: 77.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:27, 1.27it/s, est. speed input: 374.65 toks/s, output: 100.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<11:55, 1.75it/s, est. speed input: 443.34 toks/s, output: 119.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<09:30, 2.18it/s, est. speed input: 501.34 toks/s, output: 135.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<06:36, 3.11it/s, est. speed input: 604.12 toks/s, output: 167.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<05:28, 3.75it/s, est. speed input: 661.94 toks/s, output: 185.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<04:26, 4.60it/s, est. speed input: 716.61 toks/s, output: 207.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<03:22, 5.99it/s, est. speed input: 826.50 toks/s, output: 242.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:47, 7.22it/s, est. speed input: 888.43 toks/s, output: 263.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:45, 7.27it/s, est. speed input: 935.92 toks/s, output: 279.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:31, 13.06it/s, est. speed input: 1121.04 toks/s, output: 343.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:36<01:22, 14.29it/s, est. speed input: 1176.93 toks/s, output: 365.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:22, 14.23it/s, est. speed input: 1228.49 toks/s, output: 387.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<01:18, 14.91it/s, est. speed input: 1280.59 toks/s, output: 404.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:54, 21.09it/s, est. speed input: 1442.41 toks/s, output: 458.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:38<00:50, 22.93it/s, est. speed input: 1497.76 toks/s, output: 476.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:46, 24.90it/s, est. speed input: 1554.44 toks/s, output: 497.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:40, 28.33it/s, est. speed input: 1658.66 toks/s, output: 532.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:33, 33.43it/s, est. speed input: 1767.33 toks/s, output: 572.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:29, 37.68it/s, est. speed input: 1877.42 toks/s, output: 614.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:24, 45.00it/s, est. speed input: 1987.60 toks/s, output: 666.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:25, 43.48it/s, est. speed input: 2088.01 toks/s, output: 706.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:23, 46.74it/s, est. speed input: 2197.15 toks/s, output: 758.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:25, 41.96it/s, est. speed input: 2291.89 toks/s, output: 794.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:23, 45.57it/s, est. speed input: 2399.57 toks/s, output: 840.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:23, 45.11it/s, est. speed input: 2447.19 toks/s, output: 864.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:23, 44.72it/s, est. speed input: 2498.17 toks/s, output: 885.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:19, 54.56it/s, est. speed input: 2597.85 toks/s, output: 926.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:32, 31.95it/s, est. speed input: 2675.68 toks/s, output: 965.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:16, 61.01it/s, est. speed input: 2953.78 toks/s, output: 1087.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:41<00:19, 50.04it/s, est. speed input: 3087.25 toks/s, output: 1145.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:19, 49.82it/s, est. speed input: 3180.90 toks/s, output: 1195.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:41<00:19, 49.58it/s, est. speed input: 3276.54 toks/s, output: 1239.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:18, 53.12it/s, est. speed input: 3377.96 toks/s, output: 1293.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:14, 63.73it/s, est. speed input: 3524.20 toks/s, output: 1374.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:19, 48.70it/s, est. speed input: 3604.76 toks/s, output: 1416.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:20, 46.47it/s, est. speed input: 3695.44 toks/s, output: 1466.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:42<00:14, 61.44it/s, est. speed input: 3897.65 toks/s, output: 1575.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:43<00:14, 64.20it/s, est. speed input: 3998.19 toks/s, output: 1630.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:43<00:13, 67.12it/s, est. speed input: 4092.15 toks/s, output: 1672.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:43<00:12, 69.60it/s, est. speed input: 4185.43 toks/s, output: 1724.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:11, 75.63it/s, est. speed input: 4317.41 toks/s, output: 1798.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:43<00:08, 96.47it/s, est. speed input: 4512.41 toks/s, output: 1894.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:43<00:08, 103.13it/s, est. speed input: 4656.23 toks/s, output: 1968.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:43<00:07, 108.52it/s, est. speed input: 4808.35 toks/s, output: 2060.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:43<00:07, 113.33it/s, est. speed input: 4952.67 toks/s, output: 2143.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:44<00:07, 107.18it/s, est. speed input: 5099.79 toks/s, output: 2231.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:44<00:07, 105.37it/s, est. speed input: 5287.27 toks/s, output: 2333.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:44<00:06, 110.40it/s, est. speed input: 5425.57 toks/s, output: 2404.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:44<00:06, 105.85it/s, est. speed input: 5562.89 toks/s, output: 2488.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:44<00:07, 95.67it/s, est. speed input: 5741.92 toks/s, output: 2596.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:45<00:06, 108.25it/s, est. speed input: 5930.71 toks/s, output: 2714.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:45<00:06, 103.27it/s, est. speed input: 6069.82 toks/s, output: 2786.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:45<00:07, 94.96it/s, est. speed input: 6201.93 toks/s, output: 2858.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:45<00:06, 95.14it/s, est. speed input: 6327.76 toks/s, output: 2932.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:45<00:05, 119.14it/s, est. speed input: 6651.98 toks/s, output: 3083.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:45<00:05, 108.92it/s, est. speed input: 6773.18 toks/s, output: 3152.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:46<00:05, 107.45it/s, est. speed input: 6901.20 toks/s, output: 3249.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:46<00:05, 112.68it/s, est. speed input: 7034.38 toks/s, output: 3324.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:46<00:05, 104.72it/s, est. speed input: 7154.71 toks/s, output: 3390.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:46<00:04, 108.14it/s, est. speed input: 7370.15 toks/s, output: 3543.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:46<00:04, 107.08it/s, est. speed input: 7533.98 toks/s, output: 3644.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:46<00:03, 130.03it/s, est. speed input: 7763.75 toks/s, output: 3778.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:47<00:03, 138.02it/s, est. speed input: 7979.56 toks/s, output: 3932.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:47<00:03, 126.18it/s, est. speed input: 8098.68 toks/s, output: 4005.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:47<00:03, 109.22it/s, est. speed input: 8212.79 toks/s, output: 4062.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:47<00:03, 129.83it/s, est. speed input: 8440.16 toks/s, output: 4195.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:47<00:03, 122.57it/s, est. speed input: 8557.54 toks/s, output: 4288.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:47<00:03, 123.62it/s, est. speed input: 8725.55 toks/s, output: 4410.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:47<00:03, 110.64it/s, est. speed input: 8836.54 toks/s, output: 4496.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:48<00:03, 95.41it/s, est. speed input: 8939.40 toks/s, output: 4588.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:48<00:03, 89.99it/s, est. speed input: 9077.15 toks/s, output: 4697.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:48<00:04, 63.99it/s, est. speed input: 9113.85 toks/s, output: 4740.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:48<00:05, 60.88it/s, est. speed input: 9177.02 toks/s, output: 4785.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:49<00:05, 57.14it/s, est. speed input: 9234.82 toks/s, output: 4817.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:49<00:04, 68.97it/s, est. speed input: 9347.65 toks/s, output: 4927.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:49<00:02, 102.00it/s, est. speed input: 9633.59 toks/s, output: 5151.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:49<00:02, 91.28it/s, est. speed input: 9729.42 toks/s, output: 5262.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:49<00:02, 87.78it/s, est. speed input: 9795.78 toks/s, output: 5337.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:50<00:02, 79.73it/s, est. speed input: 9859.70 toks/s, output: 5423.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:50<00:02, 76.83it/s, est. speed input: 9953.25 toks/s, output: 5521.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:50<00:02, 85.62it/s, est. speed input: 10069.22 toks/s, output: 5626.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:50<00:02, 81.50it/s, est. speed input: 10134.98 toks/s, output: 5694.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:02, 78.78it/s, est. speed input: 10195.60 toks/s, output: 5770.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:50<00:02, 66.04it/s, est. speed input: 10242.43 toks/s, output: 5809.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:50<00:01, 81.88it/s, est. speed input: 10362.82 toks/s, output: 5918.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:51<00:01, 100.79it/s, est. speed input: 10552.24 toks/s, output: 6099.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:51<00:00, 97.32it/s, est. speed input: 10649.85 toks/s, output: 6214.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:51<00:00, 108.93it/s, est. speed input: 10799.87 toks/s, output: 6381.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:51<00:00, 101.36it/s, est. speed input: 10894.64 toks/s, output: 6505.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 108.41it/s, est. speed input: 11093.99 toks/s, output: 6655.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:52<00:00, 55.92it/s, est. speed input: 11096.39 toks/s, output: 6702.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:53<00:00, 31.53it/s, est. speed input: 11004.73 toks/s, output: 6695.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 20.83it/s, est. speed input: 10881.85 toks/s, output: 6688.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 9.69it/s, est. speed input: 10500.20 toks/s, output: 6489.49 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.63it/s, est. speed input: 10500.20 toks/s, output: 6489.49 toks/s]
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 465.887
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1510.0
[36m(Runner pid=3309020)[0m mean: 284.353
[36m(Runner pid=3309020)[0m min: 57.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.282
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.64
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.144
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.292
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.201
[36m(Runner pid=3309020)[0m gen: 104.978
[36m(Runner pid=3309020)[0m old: 86.217
[36m(Runner pid=3309020)[0m ref: 87.576
[36m(Runner pid=3309020)[0m reward: 6.602
[36m(Runner pid=3309020)[0m step: 846.833
[36m(Runner pid=3309020)[0m update_actor: 560.64
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 33; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:27:45 [executor_base.py:219] It took 0.342157 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:29:07 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:27:45 [executor_base.py:219] It took 0.341994 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:29:07 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:29:07 [executor_base.py:208] It took 0.326922 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:29:13 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:29:13 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:29:13 [executor_base.py:208] It took 0.325379 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00021521214512176812, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3353554606437683, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007948753773234785}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.5687559247016907, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.2655748128890991, 'actor/pg_clipfrac': 0.0006901310989633203, 'actor/ppo_kl': -0.0002922192506957799}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001238182740053162, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3365727365016937, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.09676071256399155, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3565240800380707, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00023376173339784145, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.10518162697553635, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.026864273473620415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.14244896173477173, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005288667744025588}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.48616304993629456, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.06067449972033501, 'actor/pg_clipfrac': 0.0005614823312498629, 'actor/ppo_kl': 0.00011148940393468365}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.1428881138563156, 'actor/pg_clipfrac': 0.0004887585528194904, 'actor/ppo_kl': 0.0008235849090851843}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.14644691348075867, 'actor/pg_clipfrac': 0.0015243901871144772, 'actor/ppo_kl': 2.0463292457861826e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00024179858155548573, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00031416903948411345}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.023199278861284256, 'actor/pg_clipfrac': 0.0015302218962460756, 'actor/ppo_kl': 0.0005124857416376472}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.158786341547966, 'actor/pg_clipfrac': 0.0005988024058751762, 'actor/ppo_kl': -0.0006793918437324464}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00015781383262947202, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00038160159601829946}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1957080215215683, 'actor/pg_clipfrac': 0.0024968788493424654, 'actor/ppo_kl': -6.784065044485033e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.24466805160045624, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002835421764757484}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.9141196608543396, 'actor/pg_clipfrac': 0.005511811003088951, 'actor/ppo_kl': -0.00035919429501518607}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001927595294546336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004752988170366734}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.26819056272506714, 'actor/pg_clipfrac': 0.0011627906933426857, 'actor/ppo_kl': 0.0002992851659655571}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4569595158100128, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004277981352061033}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.9947213530540466, 'actor/pg_clipfrac': 0.0018281536176800728, 'actor/ppo_kl': -0.0011446428252384067}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5227084755897522, 'actor/pg_clipfrac': 0.0009871668880805373, 'actor/ppo_kl': 0.0017883775290101767}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0004093785828445107, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006778513197787106}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.26138201355934143, 'actor/pg_clipfrac': 0.0023952096235007048, 'actor/ppo_kl': 0.0015990663086995482}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0818730965256691, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00036706848186440766}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3746923804283142, 'actor/pg_clipfrac': 0.0009233610471710563, 'actor/ppo_kl': -0.0013662745477631688}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2132166624069214, 'actor/pg_clipfrac': 0.0008163265301845968, 'actor/ppo_kl': 0.0008295767474919558}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.331748902797699, 'actor/pg_clipfrac': 0.000933706818614155, 'actor/ppo_kl': 0.0006398807163350284}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.22137926518917084, 'actor/pg_clipfrac': 0.0006277463980950415, 'actor/ppo_kl': 0.001242105383425951}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.45270469784736633, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001551252556964755}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.21392948925495148, 'actor/pg_clipfrac': 0.0011261261533945799, 'actor/ppo_kl': -0.0001858719770098105}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00021769241720903665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009161470225080848}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002418467338429764, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011086964514106512}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00023854860046412796, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008271359838545322}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.25609344244003296, 'actor/pg_clipfrac': 0.002971768146380782, 'actor/ppo_kl': -0.0003884161124005914}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.255269318819046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00046168701373972}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2904094159603119, 'actor/pg_clipfrac': 0.0013333333190530539, 'actor/ppo_kl': 0.001679150853306055}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.22163069248199463, 'actor/pg_clipfrac': 0.0011709601385518909, 'actor/ppo_kl': 0.0027809925377368927}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00020354065054561943, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008207624196074903}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00017449988808948547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014375371392816305}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.25996777415275574, 'actor/pg_clipfrac': 0.0006821282440796494, 'actor/ppo_kl': -0.0013389431405812502}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4581795930862427, 'actor/pg_clipfrac': 0.001367989112623036, 'actor/ppo_kl': 4.935101605951786e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.08873090147972107, 'actor/pg_clipfrac': 0.0036764706019312143, 'actor/ppo_kl': -0.0013761555310338736}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.047341667115688324, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006438749260269105}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00016385303752031177, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.3676304661203176e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022872175031807274, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.146230144717265e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001877499307738617, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.1283775620540837e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.04336369037628174, 'actor/pg_clipfrac': 0.0015898251440376043, 'actor/ppo_kl': 0.001830314751714468}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.060853373259305954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00022546462423633784}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.20847249031066895, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007819270249456167}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.10903974622488022, 'actor/pg_clipfrac': 0.0013106160331517458, 'actor/ppo_kl': 0.0025073259603232145}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.24943342804908752, 'actor/pg_clipfrac': 0.0008695652359165251, 'actor/ppo_kl': 0.00019056900055147707}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.17193496227264404, 'actor/pg_clipfrac': 0.0022321429569274187, 'actor/ppo_kl': -0.00031594009487889707}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.17065148055553436, 'actor/pg_clipfrac': 0.0009881423320621252, 'actor/ppo_kl': 0.001280539552681148}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2789246439933777, 'actor/pg_clipfrac': 0.0037735849618911743, 'actor/ppo_kl': 0.0006348495953716338}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.14847776293754578, 'actor/pg_clipfrac': 0.001962708542123437, 'actor/ppo_kl': 0.0013797992141917348}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001920600770972669, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005163569585420191}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00017525571456644684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010804333724081516}
[36m(Runner pid=3309020)[0m Step 33
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.264
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.022
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.007
[36m(Runner pid=3309020)[0m ppo_kl: 6.181506450850805e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.006
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:32:21, 4.35s/it, est. speed input: 107.67 toks/s, output: 23.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<47:04, 2.22s/it, est. speed input: 181.33 toks/s, output: 46.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:27<30:01, 1.42s/it, est. speed input: 247.81 toks/s, output: 64.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<18:44, 1.12it/s, est. speed input: 322.54 toks/s, output: 86.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<14:25, 1.45it/s, est. speed input: 383.09 toks/s, output: 104.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<10:45, 1.94it/s, est. speed input: 448.06 toks/s, output: 122.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<10:18, 2.01it/s, est. speed input: 490.62 toks/s, output: 136.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<07:31, 2.75it/s, est. speed input: 556.25 toks/s, output: 157.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<05:39, 3.64it/s, est. speed input: 617.84 toks/s, output: 172.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<04:10, 4.91it/s, est. speed input: 681.90 toks/s, output: 190.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<03:41, 5.54it/s, est. speed input: 739.37 toks/s, output: 207.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:24, 5.95it/s, est. speed input: 787.60 toks/s, output: 228.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:05, 9.61it/s, est. speed input: 910.68 toks/s, output: 269.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:35<01:08, 17.46it/s, est. speed input: 1100.96 toks/s, output: 342.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:01, 19.20it/s, est. speed input: 1162.37 toks/s, output: 360.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<00:46, 25.63it/s, est. speed input: 1289.14 toks/s, output: 401.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<00:36, 32.31it/s, est. speed input: 1406.22 toks/s, output: 445.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<00:52, 22.18it/s, est. speed input: 1503.39 toks/s, output: 480.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<01:00, 18.97it/s, est. speed input: 1551.09 toks/s, output: 498.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:47, 23.96it/s, est. speed input: 1666.21 toks/s, output: 535.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:37<00:33, 33.50it/s, est. speed input: 1841.32 toks/s, output: 593.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:38, 29.35it/s, est. speed input: 1889.07 toks/s, output: 610.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:35, 31.45it/s, est. speed input: 1996.68 toks/s, output: 657.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:39, 27.91it/s, est. speed input: 2042.18 toks/s, output: 679.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:37, 29.59it/s, est. speed input: 2096.72 toks/s, output: 708.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:44, 24.55it/s, est. speed input: 2141.02 toks/s, output: 726.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:44, 24.34it/s, est. speed input: 2236.34 toks/s, output: 775.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:46, 23.54it/s, est. speed input: 2285.01 toks/s, output: 801.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:37, 28.57it/s, est. speed input: 2380.96 toks/s, output: 842.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<01:01, 17.50it/s, est. speed input: 2394.46 toks/s, output: 854.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:52, 20.42it/s, est. speed input: 2445.05 toks/s, output: 878.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:40, 26.20it/s, est. speed input: 2549.54 toks/s, output: 924.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:38, 26.99it/s, est. speed input: 2599.96 toks/s, output: 950.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:24, 41.57it/s, est. speed input: 2762.00 toks/s, output: 1025.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:20, 50.80it/s, est. speed input: 2868.99 toks/s, output: 1071.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:18, 54.53it/s, est. speed input: 2968.94 toks/s, output: 1114.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:17, 57.61it/s, est. speed input: 3067.11 toks/s, output: 1155.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:18, 54.70it/s, est. speed input: 3154.21 toks/s, output: 1204.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:42<00:22, 42.83it/s, est. speed input: 3241.72 toks/s, output: 1252.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:42<00:18, 51.66it/s, est. speed input: 3432.12 toks/s, output: 1344.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:13, 70.99it/s, est. speed input: 3695.10 toks/s, output: 1488.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:43<00:15, 58.98it/s, est. speed input: 3782.24 toks/s, output: 1539.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:43<00:12, 72.00it/s, est. speed input: 3981.49 toks/s, output: 1634.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:13, 68.66it/s, est. speed input: 4066.31 toks/s, output: 1693.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:43<00:11, 79.13it/s, est. speed input: 4209.39 toks/s, output: 1785.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:08, 97.70it/s, est. speed input: 4410.44 toks/s, output: 1884.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:08, 105.48it/s, est. speed input: 4597.71 toks/s, output: 1997.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:44<00:10, 79.22it/s, est. speed input: 4732.25 toks/s, output: 2086.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:44<00:10, 80.16it/s, est. speed input: 4825.13 toks/s, output: 2130.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:44<00:11, 71.07it/s, est. speed input: 4906.50 toks/s, output: 2183.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:44<00:09, 82.29it/s, est. speed input: 5100.52 toks/s, output: 2275.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:44<00:10, 75.55it/s, est. speed input: 5177.78 toks/s, output: 2326.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:08, 86.41it/s, est. speed input: 5409.30 toks/s, output: 2445.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:10, 72.24it/s, est. speed input: 5480.66 toks/s, output: 2491.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:11, 66.22it/s, est. speed input: 5558.52 toks/s, output: 2536.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:45<00:06, 101.95it/s, est. speed input: 5859.21 toks/s, output: 2705.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:45<00:06, 106.13it/s, est. speed input: 5994.75 toks/s, output: 2790.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:45<00:05, 118.67it/s, est. speed input: 6262.93 toks/s, output: 2956.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:07, 83.35it/s, est. speed input: 6366.97 toks/s, output: 3031.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:46<00:07, 87.61it/s, est. speed input: 6490.27 toks/s, output: 3113.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:46<00:05, 112.62it/s, est. speed input: 6726.32 toks/s, output: 3270.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:46<00:04, 119.99it/s, est. speed input: 6903.84 toks/s, output: 3384.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:46<00:04, 114.05it/s, est. speed input: 7033.31 toks/s, output: 3468.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:46<00:04, 116.18it/s, est. speed input: 7174.99 toks/s, output: 3556.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:47<00:06, 81.79it/s, est. speed input: 7279.39 toks/s, output: 3631.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:47<00:06, 78.74it/s, est. speed input: 7435.56 toks/s, output: 3726.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:47<00:06, 75.72it/s, est. speed input: 7513.55 toks/s, output: 3777.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:47<00:05, 95.89it/s, est. speed input: 7702.97 toks/s, output: 3904.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:05, 86.07it/s, est. speed input: 7820.91 toks/s, output: 3978.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:04, 103.41it/s, est. speed input: 7993.48 toks/s, output: 4083.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:48<00:03, 128.20it/s, est. speed input: 8213.48 toks/s, output: 4245.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:48<00:02, 138.69it/s, est. speed input: 8378.44 toks/s, output: 4332.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:48<00:02, 140.95it/s, est. speed input: 8547.12 toks/s, output: 4460.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:48<00:02, 128.41it/s, est. speed input: 8704.88 toks/s, output: 4561.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:48<00:02, 128.58it/s, est. speed input: 8909.55 toks/s, output: 4700.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:49<00:02, 125.85it/s, est. speed input: 9079.67 toks/s, output: 4836.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:49<00:02, 126.65it/s, est. speed input: 9202.42 toks/s, output: 4935.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:49<00:02, 131.62it/s, est. speed input: 9405.86 toks/s, output: 5120.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:49<00:02, 116.91it/s, est. speed input: 9515.17 toks/s, output: 5204.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:49<00:01, 132.12it/s, est. speed input: 9675.05 toks/s, output: 5345.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:49<00:02, 114.96it/s, est. speed input: 9775.78 toks/s, output: 5433.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:50<00:01, 109.20it/s, est. speed input: 9959.13 toks/s, output: 5598.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:50<00:01, 115.16it/s, est. speed input: 10083.98 toks/s, output: 5685.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:50<00:01, 129.83it/s, est. speed input: 10248.41 toks/s, output: 5829.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:01, 132.35it/s, est. speed input: 10368.26 toks/s, output: 5937.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:50<00:01, 112.97it/s, est. speed input: 10474.33 toks/s, output: 6059.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:50<00:01, 82.67it/s, est. speed input: 10552.94 toks/s, output: 6144.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:51<00:01, 87.22it/s, est. speed input: 10659.81 toks/s, output: 6250.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:51<00:01, 68.16it/s, est. speed input: 10720.89 toks/s, output: 6344.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:51<00:00, 80.75it/s, est. speed input: 10867.58 toks/s, output: 6513.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:51<00:01, 68.80it/s, est. speed input: 10911.28 toks/s, output: 6567.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:52<00:00, 57.02it/s, est. speed input: 10970.72 toks/s, output: 6660.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:52<00:00, 51.90it/s, est. speed input: 11010.68 toks/s, output: 6733.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:52<00:00, 58.25it/s, est. speed input: 11072.45 toks/s, output: 6802.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:52<00:00, 52.91it/s, est. speed input: 11108.28 toks/s, output: 6871.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:53<00:00, 43.60it/s, est. speed input: 11127.21 toks/s, output: 6927.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:53<00:00, 36.79it/s, est. speed input: 11134.26 toks/s, output: 6973.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.75it/s, est. speed input: 11052.57 toks/s, output: 6939.31 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.66it/s, est. speed input: 11052.57 toks/s, output: 6939.31 toks/s]
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.006
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.64
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.64
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 957794
[36m(Runner pid=3309020)[0m balanced_min: 957793
[36m(Runner pid=3309020)[0m max: 961113
[36m(Runner pid=3309020)[0m mean: 957793.5
[36m(Runner pid=3309020)[0m min: 954474
[36m(Runner pid=3309020)[0m minmax_diff: 6639
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.913
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.118
[36m(Runner pid=3309020)[0m throughput: 1141.631
[36m(Runner pid=3309020)[0m time_per_step: 838.969
[36m(Runner pid=3309020)[0m total_num_tokens: 1915587
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 463.32
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1334.0
[36m(Runner pid=3309020)[0m mean: 284.956
[36m(Runner pid=3309020)[0m min: 52.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.282
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.64
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.139
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.293
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.202
[36m(Runner pid=3309020)[0m gen: 101.434
[36m(Runner pid=3309020)[0m old: 84.099
[36m(Runner pid=3309020)[0m ref: 85.005
[36m(Runner pid=3309020)[0m reward: 5.891
[36m(Runner pid=3309020)[0m step: 838.969
[36m(Runner pid=3309020)[0m update_actor: 561.726
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 34; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.67 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:41:50 [executor_base.py:219] It took 0.339739 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.59 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:43:15 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:41:50 [executor_base.py:219] It took 0.340529 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:43:16 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:43:16 [executor_base.py:208] It took 0.325634 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:43:39 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:43:40 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:43:40 [executor_base.py:208] It took 0.328201 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.35128000378608704, 'actor/pg_clipfrac': 0.0012690355069935322, 'actor/ppo_kl': 0.0004329294024500996}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2907693684101105, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001001712735160254}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00013846892397850752, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006556713487952948}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4474738836288452, 'actor/pg_clipfrac': 0.0006257822387851775, 'actor/ppo_kl': 0.0007412639679387212}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002076846285490319, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010340552544221282}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.22326143085956573, 'actor/pg_clipfrac': 0.0006242197123356164, 'actor/ppo_kl': 0.0007811759714968503}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002260871697217226, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001527351705590263, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000510424782987684}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3630201816558838, 'actor/pg_clipfrac': 0.002336448524147272, 'actor/ppo_kl': -0.002617902820929885}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2768460512161255, 'actor/pg_clipfrac': 0.0021715527400374413, 'actor/ppo_kl': 0.0010201436234638095}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00018829022883437574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00015916989650577307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.8137348294258118, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002615309495013207, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002183055505156517}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.022933516651391983, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08550570160150528, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.06918991357088089, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001087907119654119}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2601296305656433, 'actor/pg_clipfrac': 0.0007782101165503263, 'actor/ppo_kl': 0.0011234446428716183}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.07008394598960876, 'actor/pg_clipfrac': 0.002794857369735837, 'actor/ppo_kl': 0.0011526505695655942}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.4472811818122864, 'actor/pg_clipfrac': 0.0049019609577953815, 'actor/ppo_kl': -0.000505302450619638}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.08646919578313828, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018584486097097397}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.32886019349098206, 'actor/pg_clipfrac': 0.0026666666381061077, 'actor/ppo_kl': -0.0009976128349080682}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5423481464385986, 'actor/pg_clipfrac': 0.001784652005881071, 'actor/ppo_kl': -0.00117199937812984}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3750353157520294, 'actor/pg_clipfrac': 0.0015212981961667538, 'actor/ppo_kl': 0.0006971629918552935}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5944652557373047, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00064101378666237}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3706422448158264, 'actor/pg_clipfrac': 0.0008818341884762049, 'actor/ppo_kl': 0.0012805036967620254}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00020230414520483464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014877997455187142}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00018425089365337044, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00032148617901839316}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.22518227994441986, 'actor/pg_clipfrac': 0.0045095826499164104, 'actor/ppo_kl': -0.0007565142586827278}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002620555169414729, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005760985077358782}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.12912732362747192, 'actor/pg_clipfrac': 0.0008841733215376735, 'actor/ppo_kl': -0.0011946916347369552}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2380075454711914, 'actor/pg_clipfrac': 0.0008163265301845968, 'actor/ppo_kl': 0.0009447059128433466}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5949887037277222, 'actor/pg_clipfrac': 0.001899335184134543, 'actor/ppo_kl': 0.001642171060666442}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.35759690403938293, 'actor/pg_clipfrac': 0.0011198208667337894, 'actor/ppo_kl': -0.000682807294651866}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4471732974052429, 'actor/pg_clipfrac': 0.0012658227933570743, 'actor/ppo_kl': 3.9474875848100055e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.3297567665576935, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010510213178349659}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.5959818959236145, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012126002693548799}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.20316959917545319, 'actor/pg_clipfrac': 0.0018198362085968256, 'actor/ppo_kl': -0.000935241230763495}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1922195553779602, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0022619348019361496}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.09792790561914444, 'actor/pg_clipfrac': 0.001251564477570355, 'actor/ppo_kl': -0.0004219036200083792}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.08694196492433548, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002342402236536145}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.09123445302248001, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007766344933770597}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5002429485321045, 'actor/pg_clipfrac': 0.0009017132688313723, 'actor/ppo_kl': -0.0018648331752046943}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.12123759835958481, 'actor/pg_clipfrac': 0.0007867820677347481, 'actor/ppo_kl': -0.0007205621222965419}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3305209279060364, 'actor/pg_clipfrac': 0.0010080644860863686, 'actor/ppo_kl': 0.0005473821656778455}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.4474969506263733, 'actor/pg_clipfrac': 0.002037697471678257, 'actor/ppo_kl': -0.0004500531358644366}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2012636661529541, 'actor/pg_clipfrac': 0.0007776049897074699, 'actor/ppo_kl': 0.00022166327107697725}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.17971977591514587, 'actor/pg_clipfrac': 0.0009132419945672154, 'actor/ppo_kl': -0.0018596997251734138}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.22825540602207184, 'actor/pg_clipfrac': 0.0009267840650863945, 'actor/ppo_kl': -0.0007719901041127741}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.35121414065361023, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006562964408658445}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6065028309822083, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003980521869380027}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.06575420498847961, 'actor/pg_clipfrac': 0.0018975331913679838, 'actor/ppo_kl': -0.0016689010662958026}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00018946873024106026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027437476092018187}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.14829719066619873, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008761637145653367}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2278391271829605, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00046902766916900873}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.06984826177358627, 'actor/pg_clipfrac': 0.0011312217684462667, 'actor/ppo_kl': 0.002700993325561285}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00022533077572006732, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002619321458041668}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.07851202040910721, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008850251906551421}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.05838015675544739, 'actor/pg_clipfrac': 0.001743679167702794, 'actor/ppo_kl': -0.0002531901409383863}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.35385197401046753, 'actor/pg_clipfrac': 0.001254705130122602, 'actor/ppo_kl': -0.002416998380795121}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.10146135836839676, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00018754758639261127}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:27:28, 4.12s/it, est. speed input: 100.57 toks/s, output: 23.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<37:22, 1.77s/it, est. speed input: 207.96 toks/s, output: 43.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<28:29, 1.35s/it, est. speed input: 257.90 toks/s, output: 58.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<21:04, 1.00s/it, est. speed input: 329.58 toks/s, output: 74.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<13:59, 1.50it/s, est. speed input: 406.55 toks/s, output: 95.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<09:25, 2.21it/s, est. speed input: 488.07 toks/s, output: 117.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<07:57, 2.61it/s, est. speed input: 545.86 toks/s, output: 135.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<06:14, 3.31it/s, est. speed input: 607.15 toks/s, output: 157.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<07:43, 2.66it/s, est. speed input: 621.46 toks/s, output: 167.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<06:52, 2.98it/s, est. speed input: 667.31 toks/s, output: 181.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<03:06, 6.53it/s, est. speed input: 866.94 toks/s, output: 242.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:37, 7.67it/s, est. speed input: 928.45 toks/s, output: 261.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<02:33, 7.86it/s, est. speed input: 978.01 toks/s, output: 276.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<02:03, 9.71it/s, est. speed input: 1040.31 toks/s, output: 297.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:05, 17.94it/s, est. speed input: 1285.65 toks/s, output: 386.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<01:09, 16.93it/s, est. speed input: 1334.84 toks/s, output: 403.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<00:54, 21.30it/s, est. speed input: 1451.67 toks/s, output: 441.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<01:13, 15.81it/s, est. speed input: 1487.75 toks/s, output: 457.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:47, 24.29it/s, est. speed input: 1662.39 toks/s, output: 524.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:37<00:52, 21.55it/s, est. speed input: 1704.19 toks/s, output: 546.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:37<00:48, 23.42it/s, est. speed input: 1756.28 toks/s, output: 568.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:51, 22.01it/s, est. speed input: 1798.95 toks/s, output: 590.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:42, 26.06it/s, est. speed input: 1908.06 toks/s, output: 633.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:40, 27.87it/s, est. speed input: 1960.71 toks/s, output: 654.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:41, 26.74it/s, est. speed input: 2057.13 toks/s, output: 698.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:34, 32.11it/s, est. speed input: 2168.57 toks/s, output: 749.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:26, 40.23it/s, est. speed input: 2281.74 toks/s, output: 786.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:41, 26.01it/s, est. speed input: 2313.93 toks/s, output: 802.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:39<00:29, 35.54it/s, est. speed input: 2482.76 toks/s, output: 870.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:28, 36.71it/s, est. speed input: 2531.24 toks/s, output: 893.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:25, 41.75it/s, est. speed input: 2637.42 toks/s, output: 927.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:24, 42.22it/s, est. speed input: 2736.71 toks/s, output: 972.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:24, 42.45it/s, est. speed input: 2785.36 toks/s, output: 988.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:24, 42.70it/s, est. speed input: 2833.03 toks/s, output: 1012.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:41<00:34, 29.92it/s, est. speed input: 2906.24 toks/s, output: 1041.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:36, 28.15it/s, est. speed input: 2946.90 toks/s, output: 1060.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:32, 31.14it/s, est. speed input: 2993.79 toks/s, output: 1083.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:32, 31.35it/s, est. speed input: 3036.04 toks/s, output: 1103.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:29, 34.22it/s, est. speed input: 3128.34 toks/s, output: 1151.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:29, 33.91it/s, est. speed input: 3172.62 toks/s, output: 1179.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:15, 63.94it/s, est. speed input: 3389.71 toks/s, output: 1285.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:13, 71.27it/s, est. speed input: 3492.42 toks/s, output: 1335.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:13, 70.00it/s, est. speed input: 3587.46 toks/s, output: 1394.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:13, 68.93it/s, est. speed input: 3679.01 toks/s, output: 1449.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:42<00:11, 78.49it/s, est. speed input: 3828.65 toks/s, output: 1504.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:42<00:10, 85.44it/s, est. speed input: 3970.98 toks/s, output: 1572.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:42<00:12, 73.33it/s, est. speed input: 4063.49 toks/s, output: 1623.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:15, 58.47it/s, est. speed input: 4146.04 toks/s, output: 1669.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:43<00:12, 71.01it/s, est. speed input: 4296.43 toks/s, output: 1744.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:09, 91.20it/s, est. speed input: 4503.16 toks/s, output: 1846.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:43<00:07, 107.70it/s, est. speed input: 4702.12 toks/s, output: 1952.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:43<00:07, 110.36it/s, est. speed input: 4839.44 toks/s, output: 2031.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:09, 89.35it/s, est. speed input: 4980.19 toks/s, output: 2106.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:44<00:07, 106.64it/s, est. speed input: 5169.15 toks/s, output: 2205.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:44<00:08, 96.01it/s, est. speed input: 5303.93 toks/s, output: 2295.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:44<00:07, 99.51it/s, est. speed input: 5524.73 toks/s, output: 2436.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:44<00:10, 69.49it/s, est. speed input: 5629.60 toks/s, output: 2513.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:45<00:09, 77.06it/s, est. speed input: 5770.66 toks/s, output: 2586.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:45<00:11, 63.69it/s, est. speed input: 5840.57 toks/s, output: 2636.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:45<00:08, 80.85it/s, est. speed input: 6072.57 toks/s, output: 2768.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:45<00:06, 93.95it/s, est. speed input: 6332.35 toks/s, output: 2932.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:45<00:06, 99.57it/s, est. speed input: 6462.89 toks/s, output: 3006.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:46<00:09, 65.80it/s, est. speed input: 6550.38 toks/s, output: 3051.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:46<00:08, 69.13it/s, est. speed input: 6631.99 toks/s, output: 3119.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:46<00:08, 67.78it/s, est. speed input: 6708.50 toks/s, output: 3166.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:46<00:05, 112.03it/s, est. speed input: 7029.83 toks/s, output: 3393.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:47<00:05, 98.21it/s, est. speed input: 7143.59 toks/s, output: 3485.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:47<00:03, 141.27it/s, est. speed input: 7468.55 toks/s, output: 3710.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:47<00:05, 96.66it/s, est. speed input: 7601.60 toks/s, output: 3783.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:47<00:06, 77.58it/s, est. speed input: 7696.38 toks/s, output: 3858.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:47<00:05, 85.55it/s, est. speed input: 7822.79 toks/s, output: 3951.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:48<00:04, 101.53it/s, est. speed input: 8070.57 toks/s, output: 4102.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:48<00:04, 94.76it/s, est. speed input: 8184.91 toks/s, output: 4202.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:48<00:04, 96.05it/s, est. speed input: 8304.47 toks/s, output: 4273.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:48<00:03, 100.39it/s, est. speed input: 8423.40 toks/s, output: 4369.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:48<00:03, 109.45it/s, est. speed input: 8549.62 toks/s, output: 4463.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:48<00:02, 126.64it/s, est. speed input: 8722.93 toks/s, output: 4581.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:48<00:02, 143.41it/s, est. speed input: 8893.09 toks/s, output: 4704.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:49<00:02, 117.21it/s, est. speed input: 9037.77 toks/s, output: 4815.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:49<00:01, 148.68it/s, est. speed input: 9297.91 toks/s, output: 5040.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:49<00:01, 168.62it/s, est. speed input: 9504.37 toks/s, output: 5237.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:49<00:01, 155.37it/s, est. speed input: 9663.16 toks/s, output: 5366.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:49<00:02, 107.63it/s, est. speed input: 9792.48 toks/s, output: 5471.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:50<00:01, 112.20it/s, est. speed input: 9905.64 toks/s, output: 5578.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:50<00:01, 110.64it/s, est. speed input: 10018.95 toks/s, output: 5663.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:50<00:01, 96.82it/s, est. speed input: 10116.99 toks/s, output: 5739.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:01, 90.27it/s, est. speed input: 10216.23 toks/s, output: 5840.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:50<00:01, 109.36it/s, est. speed input: 10497.37 toks/s, output: 6098.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:51<00:01, 89.69it/s, est. speed input: 10580.63 toks/s, output: 6167.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:51<00:01, 94.92it/s, est. speed input: 10688.60 toks/s, output: 6283.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:51<00:01, 67.41it/s, est. speed input: 10737.44 toks/s, output: 6348.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:51<00:00, 74.61it/s, est. speed input: 10851.12 toks/s, output: 6446.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:51<00:00, 77.16it/s, est. speed input: 10919.08 toks/s, output: 6521.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:52<00:00, 78.13it/s, est. speed input: 11015.28 toks/s, output: 6628.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:00, 60.79it/s, est. speed input: 11048.33 toks/s, output: 6691.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 37.37it/s, est. speed input: 11012.82 toks/s, output: 6698.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:53<00:00, 41.00it/s, est. speed input: 11063.36 toks/s, output: 6777.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 8.96it/s, est. speed input: 10463.08 toks/s, output: 6453.38 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.61it/s, est. speed input: 10463.08 toks/s, output: 6453.38 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0001453783770557493, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006444209138862789}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.18283489346504211, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001604237244464457}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3130454123020172, 'actor/pg_clipfrac': 0.0009823183063417673, 'actor/ppo_kl': -0.0006927835056558251}
[36m(Runner pid=3309020)[0m Step 34
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.308
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.03
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: -6.597249024054009e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.634
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.634
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 973781
[36m(Runner pid=3309020)[0m balanced_min: 973781
[36m(Runner pid=3309020)[0m max: 974354
[36m(Runner pid=3309020)[0m mean: 973781.0
[36m(Runner pid=3309020)[0m min: 973208
[36m(Runner pid=3309020)[0m minmax_diff: 1146
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.42
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1118.935
[36m(Runner pid=3309020)[0m time_per_step: 870.275
[36m(Runner pid=3309020)[0m total_num_tokens: 1947562
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 467.428
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1090.0
[36m(Runner pid=3309020)[0m mean: 293.339
[36m(Runner pid=3309020)[0m min: 51.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.27
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.634
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.220560013847386e-05
[36m(Runner pid=3309020)[0m gen: 0.166
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.29
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.18
[36m(Runner pid=3309020)[0m gen: 124.515
[36m(Runner pid=3309020)[0m old: 86.719
[36m(Runner pid=3309020)[0m ref: 88.079
[36m(Runner pid=3309020)[0m reward: 5.986
[36m(Runner pid=3309020)[0m step: 870.275
[36m(Runner pid=3309020)[0m update_actor: 564.197
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 35; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:56:21 [executor_base.py:219] It took 0.338274 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.58 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:57:48 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:56:21 [executor_base.py:219] It took 0.340973 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:57:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 05:57:48 [executor_base.py:208] It took 0.327679 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:57:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:57:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 05:57:49 [executor_base.py:208] It took 0.325471 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.44717738032341003, 'actor/pg_clipfrac': 0.0024449878837913275, 'actor/ppo_kl': -8.342901128344238e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3541848063468933, 'actor/pg_clipfrac': 0.0006877579144202173, 'actor/ppo_kl': 0.00017627365014050156}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002773744927253574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2697312533855438, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00016126422269735485, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006865064497105777}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001811468682717532, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1316218078136444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.38421130180358887, 'actor/pg_clipfrac': 0.004694835748523474, 'actor/ppo_kl': -0.0009090485982596874}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1301545947790146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021899168496020138, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.07606388628482819, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.5614978075027466, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00014884780102875084, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.027002327144145966, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.04999379441142082, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00019379753211978823, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007967291167005897}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.42450395226478577, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014926427975296974}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.34640297293663025, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004463400109671056}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.04355577006936073, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000668558175675571}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.18822544813156128, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008520164992660284}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00017572635260876268, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010068799601867795}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00021546361676882952, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.041364056523889e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2584691047668457, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005533838993869722}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.29699814319610596, 'actor/pg_clipfrac': 0.0015923567116260529, 'actor/ppo_kl': 0.0002441021497361362}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.000174586137291044, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006397916004061699}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.000174271161085926, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006620772182941437}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.11684782803058624, 'actor/pg_clipfrac': 0.0006329113966785371, 'actor/ppo_kl': -0.00036875688238069415}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4233170747756958, 'actor/pg_clipfrac': 0.0016155089251697063, 'actor/ppo_kl': -0.002952051814645529}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1724630743265152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.003958359360694885}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002972520305775106, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016101283254101872}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2593408524990082, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0021701413206756115}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00034041868639178574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009778024395927787}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.28980034589767456, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001932275015860796}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.21886658668518066, 'actor/pg_clipfrac': 0.0008474576170556247, 'actor/ppo_kl': 0.00031801481964066625}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001687428157310933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00037907148362137377}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.49077093601226807, 'actor/pg_clipfrac': 0.0014204545877873898, 'actor/ppo_kl': 0.0009621652425266802}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00015181103663053364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -4.197426824248396e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00034135073656216264, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009028555359691381}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.19287382066249847, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001072897925041616}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.31890642642974854, 'actor/pg_clipfrac': 0.003574620233848691, 'actor/ppo_kl': -0.0011770522687584162}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00023594930826220661, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.688044151000213e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002008914016187191, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008899201056919992}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00017138858675025403, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00024533664691261947}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00026863545645028353, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.332549673970789e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3982313573360443, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019250484183430672}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.15231844782829285, 'actor/pg_clipfrac': 0.0007633587811142206, 'actor/ppo_kl': -0.00059175567002967}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.32526466250419617, 'actor/pg_clipfrac': 0.003375527448952198, 'actor/ppo_kl': -0.00042340403888374567}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.28865692019462585, 'actor/pg_clipfrac': 0.00324324332177639, 'actor/ppo_kl': 0.0015410984633490443}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.06948605924844742, 'actor/pg_clipfrac': 0.0010416667209938169, 'actor/ppo_kl': -0.0005484739667735994}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.0039231437258422375, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014077952364459634}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.034161828458309174, 'actor/pg_clipfrac': 0.0058823530562222, 'actor/ppo_kl': 0.0017528337193652987}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.06387436389923096, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00042829031008295715}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00027875410160049796, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00018190177797805518}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2466459423303604, 'actor/pg_clipfrac': 0.0007235889788717031, 'actor/ppo_kl': 0.0014129133196547627}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.05339459702372551, 'actor/pg_clipfrac': 0.002752293599769473, 'actor/ppo_kl': 0.0019839839078485966}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:36:54, 15.46s/it, est. speed input: 30.59 toks/s, output: 6.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<25:17, 4.06s/it, est. speed input: 89.32 toks/s, output: 17.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:15<12:35, 2.03s/it, est. speed input: 145.02 toks/s, output: 29.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:16<07:41, 1.25s/it, est. speed input: 199.84 toks/s, output: 42.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 12/377 [00:16<03:08, 1.94it/s, est. speed input: 338.87 toks/s, output: 77.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 15/377 [00:16<02:08, 2.81it/s, est. speed input: 419.97 toks/s, output: 98.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 17/377 [00:16<01:42, 3.52it/s, est. speed input: 471.60 toks/s, output: 112.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 20/377 [00:16<01:15, 4.71it/s, est. speed input: 550.40 toks/s, output: 132.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:16<00:41, 8.49it/s, est. speed input: 709.69 toks/s, output: 176.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 30/377 [00:17<00:33, 10.50it/s, est. speed input: 810.61 toks/s, output: 207.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 38/377 [00:17<00:19, 17.46it/s, est. speed input: 1018.56 toks/s, output: 272.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 42/377 [00:17<00:17, 18.98it/s, est. speed input: 1117.12 toks/s, output: 302.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 46/377 [00:17<00:15, 21.32it/s, est. speed input: 1214.39 toks/s, output: 334.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 56/377 [00:17<00:10, 31.69it/s, est. speed input: 1467.07 toks/s, output: 419.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 61/377 [00:17<00:09, 33.69it/s, est. speed input: 1585.26 toks/s, output: 463.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 68/377 [00:17<00:08, 37.25it/s, est. speed input: 1753.07 toks/s, output: 524.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 77/377 [00:18<00:06, 46.32it/s, est. speed input: 1973.34 toks/s, output: 606.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 83/377 [00:18<00:06, 44.69it/s, est. speed input: 2109.32 toks/s, output: 659.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▎ | 89/377 [00:18<00:06, 46.38it/s, est. speed input: 2246.54 toks/s, output: 713.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 95/377 [00:18<00:06, 42.51it/s, est. speed input: 2375.96 toks/s, output: 765.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 101/377 [00:18<00:06, 45.05it/s, est. speed input: 2510.13 toks/s, output: 821.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 106/377 [00:18<00:06, 39.84it/s, est. speed input: 2608.98 toks/s, output: 865.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 116/377 [00:18<00:05, 51.93it/s, est. speed input: 2839.27 toks/s, output: 966.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 122/377 [00:19<00:05, 45.09it/s, est. speed input: 2958.93 toks/s, output: 1020.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 128/377 [00:19<00:05, 46.40it/s, est. speed input: 3087.19 toks/s, output: 1079.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:19<00:04, 54.33it/s, est. speed input: 3285.82 toks/s, output: 1172.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 143/377 [00:19<00:04, 53.56it/s, est. speed input: 3411.28 toks/s, output: 1233.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 150/377 [00:19<00:04, 55.36it/s, est. speed input: 3554.47 toks/s, output: 1306.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 156/377 [00:19<00:04, 51.51it/s, est. speed input: 3670.97 toks/s, output: 1367.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:19<00:03, 54.45it/s, est. speed input: 3816.78 toks/s, output: 1443.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 171/377 [00:19<00:03, 59.34it/s, est. speed input: 3980.60 toks/s, output: 1530.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 180/377 [00:20<00:03, 63.23it/s, est. speed input: 4174.06 toks/s, output: 1629.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 187/377 [00:20<00:03, 58.63it/s, est. speed input: 4309.07 toks/s, output: 1704.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 197/377 [00:20<00:02, 65.80it/s, est. speed input: 4517.36 toks/s, output: 1820.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 206/377 [00:20<00:02, 69.01it/s, est. speed input: 4703.25 toks/s, output: 1925.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 213/377 [00:20<00:02, 66.54it/s, est. speed input: 4837.82 toks/s, output: 2006.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:20<00:02, 66.96it/s, est. speed input: 4993.98 toks/s, output: 2102.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 228/377 [00:20<00:02, 62.08it/s, est. speed input: 5125.09 toks/s, output: 2183.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 236/377 [00:20<00:02, 64.74it/s, est. speed input: 5275.69 toks/s, output: 2282.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:21<00:01, 68.72it/s, est. speed input: 5474.15 toks/s, output: 2408.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 253/377 [00:21<00:01, 65.88it/s, est. speed input: 5604.00 toks/s, output: 2494.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 260/377 [00:21<00:01, 65.20it/s, est. speed input: 5730.04 toks/s, output: 2583.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 267/377 [00:21<00:01, 65.27it/s, est. speed input: 5861.37 toks/s, output: 2673.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 274/377 [00:21<00:01, 64.24it/s, est. speed input: 5984.55 toks/s, output: 2765.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 281/377 [00:21<00:01, 59.94it/s, est. speed input: 6100.67 toks/s, output: 2856.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▋ | 288/377 [00:21<00:01, 59.05it/s, est. speed input: 6226.96 toks/s, output: 2950.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 296/377 [00:21<00:01, 63.65it/s, est. speed input: 6371.64 toks/s, output: 3064.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 304/377 [00:21<00:01, 64.91it/s, est. speed input: 6507.82 toks/s, output: 3178.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 312/377 [00:22<00:00, 68.88it/s, est. speed input: 6649.73 toks/s, output: 3296.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 319/377 [00:22<00:01, 43.87it/s, est. speed input: 6707.50 toks/s, output: 3369.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 325/377 [00:22<00:01, 46.97it/s, est. speed input: 6803.86 toks/s, output: 3462.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 331/377 [00:22<00:01, 42.26it/s, est. speed input: 6879.40 toks/s, output: 3544.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 336/377 [00:22<00:01, 33.49it/s, est. speed input: 6909.98 toks/s, output: 3601.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 341/377 [00:23<00:01, 33.79it/s, est. speed input: 6971.79 toks/s, output: 3677.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:23<00:01, 29.84it/s, est. speed input: 6995.92 toks/s, output: 3730.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:23<00:00, 31.06it/s, est. speed input: 7056.74 toks/s, output: 3811.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:23<00:00, 23.32it/s, est. speed input: 7049.04 toks/s, output: 3849.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 357/377 [00:23<00:00, 20.52it/s, est. speed input: 7046.17 toks/s, output: 3883.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 360/377 [00:24<00:01, 13.19it/s, est. speed input: 6960.99 toks/s, output: 3875.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:24<00:01, 13.24it/s, est. speed input: 6957.35 toks/s, output: 3902.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:24<00:01, 12.30it/s, est. speed input: 6936.53 toks/s, output: 3921.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:24<00:00, 12.61it/s, est. speed input: 6935.26 toks/s, output: 3951.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:25<00:00, 11.14it/s, est. speed input: 6901.02 toks/s, output: 3979.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:25<00:00, 10.08it/s, est. speed input: 6868.29 toks/s, output: 3996.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:25<00:00, 8.18it/s, est. speed input: 6807.20 toks/s, output: 3997.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:26<00:00, 5.94it/s, est. speed input: 6715.64 toks/s, output: 3964.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [01:03<00:13, 6.75s/it, est. speed input: 2807.01 toks/s, output: 1739.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.96it/s, est. speed input: 2817.96 toks/s, output: 1910.94 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.28672733902931213, 'actor/pg_clipfrac': 0.0007745933253318071, 'actor/ppo_kl': -9.51310430536978e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.20929181575775146, 'actor/pg_clipfrac': 0.000586166454013437, 'actor/ppo_kl': -0.0015220037894323468}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00019660194811876863, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00010809402738232166}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.39862167835235596, 'actor/pg_clipfrac': 0.001308900536969304, 'actor/ppo_kl': 0.00024227322137448937}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3027181625366211, 'actor/pg_clipfrac': 0.0008496176451444626, 'actor/ppo_kl': 0.000864161818753928}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.38926488161087036, 'actor/pg_clipfrac': 0.002722322940826416, 'actor/ppo_kl': -0.00029715176788158715}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00033103968598879874, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020950993057340384}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.24373939633369446, 'actor/pg_clipfrac': 0.0031545741949230433, 'actor/ppo_kl': -0.0013377523282542825}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.44755277037620544, 'actor/pg_clipfrac': 0.0008319467306137085, 'actor/ppo_kl': 0.0008019845699891448}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:11:02 [executor_base.py:219] It took 0.338782 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:12:51 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:11:02 [executor_base.py:219] It took 0.340292 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:12:52 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:12:52 [executor_base.py:208] It took 0.329862 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:12:52 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:12:52 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:12:52 [executor_base.py:208] It took 0.325207 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since D is the midpoint of AB and l is a vertical line passing through D, the point E that minimizes the perimeter of triangle AEC will be the point on l such that AE + EC is minimized. This occurs when E is the point where the perpendicular from A to l intersects l.\n\nGiven that D is the midpoint of AB, AD = DB = 2.0. The perimeter of triangle ABC is 16.0, so AB + BC + AC = 16.0. Since D is the midpoint, AD = DB = 2.0, so AB = 4.0. Therefore, BC + AC = 12.0.\n\nNow, since E is the foot of the perpendicular from A to l, AE is the height from A to l, which is the same as the altitude from A to BC. To minimize the perimeter, E should be such that AE + EC is minimized, which means E should be directly below A on line l. This makes AE the shortest distance from A to l, and EC is the same as AC when E is at its minimum distance, which is the same as AC.\n\nTherefore, the minimum perimeter of triangle AEC is AC + EC, which is the same as AC + AC = 2 * AC. Since BC + AC = 12.0, and considering the symmetry and the fact that E is directly below A, AC will be half of BC + AC, which is 6.0.\n\nSo, the minimum perimeter of triangle AEC is 2 * 6.0 = 12.0.\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo find the size of angle AC'B', we need to analyze the given information and use properties of rotations and right triangles.\n\nGiven:\n- Triangle ABC is a right triangle with angle BAC = 90.0 degrees.\n- Triangle ABC is rotated clockwise around point A by 90.0 degrees to obtain triangle AB'C'.\n- Angle CC'B' = 32.0 degrees.\n\nSince the rotation is 90.0 degrees clockwise, triangle AB'C' is congruent to triangle ABC, and angle BAC = 90.0 degrees is preserved. This means that angle B'AC' = 90.0 degrees.\n\nNow, consider the triangle C'CC'. Since angle CC'B' = 32.0 degrees and angle B'AC' = 90.0 degrees, we can find angle AC'B' by considering the sum of angles in triangle AC'B'. The sum of angles in any triangle is 180.0 degrees.\n\nTherefore, angle AC'B' = 180.0 - 90.0 - 32.0 = 58.0 degrees.\n\nThus, the size of angle AC'B' is 58.0 degrees.\n\n\nThe size of angle AC'B' is 58.0 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to understand the properties of a parallelogram and a circle. In a parallelogram, opposite angles are equal, and consecutive angles are supplementary (they add up to 180 degrees). Given that angle ADC = 54 degrees, we can find angle ABC by subtracting 54 from 180, since ABCD is a parallelogram and consecutive angles in a parallelogram are supplementary.\n\nAngle ABC = 180 - 54 = 126 degrees.\n\nNow, since angle ABC is an inscribed angle that intercepts arc AE, the measure of the arc AE (which is the same as the measure of angle AEB because they subtend the same arc) is twice the measure of the inscribed angle. Therefore, angle AEB = 2 * 54 = 108 degrees.\n\nSo, the degree of angle AEB is 108 degrees.\n\n\nThe degree of angle AEB is 108 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, and CD is tangent to circle O at point D, we can use the properties of tangents and circles to find the measure of angle C. Since AB is the diameter, angle ADB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Therefore, angle ADB = 90 degrees.\n\nGiven that angle A = 35 degrees, we can find angle DBC by subtracting angle A from angle ADB:\nangle DBC = angle ADB - angle A = 90 degrees - 35 degrees = 55 degrees.\n\nSince CD is tangent to circle O at point D, angle ODC is a right angle (90 degrees). Therefore, angle C can be found by subtracting angle DBC from 90 degrees:\nangle C = 90 degrees - angle DBC = 90 degrees - 55 degrees = 35 degrees.\n\nSo, angle C is 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ABD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Since EA is a tangent to the circle at point A, angle EAD is also a right angle (90 degrees).\n\nGiven that angle EAC = 120 degrees, we can find angle BAC by subtracting angle EAD from angle EAC:\nangle BAC = angle EAC - angle EAD = 120 degrees - 90 degrees = 30 degrees.\n\nSince angle ABD is a right angle (90 degrees) and angle BAC is 30 degrees, we can use the fact that the sum of angles in a triangle is 180 degrees to find angle ABC:\nangle ABC = 180 degrees - angle ABD - angle BAC = 180 degrees - 90 degrees - 30 degrees = 60 degrees.\n\nTherefore, the degree of angle ABC is 60 degrees.\n\n\n60\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_20
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 35
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.243
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.018
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 6.470203328703405e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.661
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.661
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 957591
[36m(Runner pid=3309020)[0m balanced_min: 957591
[36m(Runner pid=3309020)[0m max: 957657
[36m(Runner pid=3309020)[0m mean: 957591.0
[36m(Runner pid=3309020)[0m min: 957525
[36m(Runner pid=3309020)[0m minmax_diff: 132
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.877
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.118
[36m(Runner pid=3309020)[0m throughput: 920.223
[36m(Runner pid=3309020)[0m time_per_step: 1040.607
[36m(Runner pid=3309020)[0m total_num_tokens: 1915182
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 687.0
[36m(Runner pid=3309020)[0m mean: 463.797
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1322.0
[36m(Runner pid=3309020)[0m mean: 284.321
[36m(Runner pid=3309020)[0m min: 44.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.324
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.661
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:27<1:56:48, 5.50s/it, est. speed input: 84.41 toks/s, output: 22.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<49:59, 2.36s/it, est. speed input: 168.94 toks/s, output: 41.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<29:24, 1.40s/it, est. speed input: 239.81 toks/s, output: 59.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<21:19, 1.02s/it, est. speed input: 291.18 toks/s, output: 74.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<13:54, 1.50it/s, est. speed input: 363.26 toks/s, output: 96.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:32<10:04, 2.07it/s, est. speed input: 423.68 toks/s, output: 116.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<07:43, 2.69it/s, est. speed input: 481.25 toks/s, output: 137.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<04:25, 4.65it/s, est. speed input: 609.06 toks/s, output: 180.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<03:31, 5.83it/s, est. speed input: 671.42 toks/s, output: 200.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<01:45, 11.48it/s, est. speed input: 871.01 toks/s, output: 267.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:34<01:19, 15.22it/s, est. speed input: 998.42 toks/s, output: 309.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:35<01:29, 13.33it/s, est. speed input: 1100.05 toks/s, output: 352.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:22, 14.41it/s, est. speed input: 1157.36 toks/s, output: 370.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:05, 18.05it/s, est. speed input: 1284.76 toks/s, output: 415.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<00:58, 20.00it/s, est. speed input: 1341.80 toks/s, output: 440.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<01:03, 18.32it/s, est. speed input: 1392.74 toks/s, output: 461.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<01:31, 12.80it/s, est. speed input: 1427.72 toks/s, output: 473.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<01:03, 18.31it/s, est. speed input: 1547.51 toks/s, output: 514.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:53, 21.33it/s, est. speed input: 1653.09 toks/s, output: 556.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:33, 33.69it/s, est. speed input: 1894.20 toks/s, output: 649.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:32, 34.30it/s, est. speed input: 1942.72 toks/s, output: 665.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:36, 30.44it/s, est. speed input: 1989.56 toks/s, output: 685.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:38<00:26, 42.12it/s, est. speed input: 2164.29 toks/s, output: 759.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:26, 42.08it/s, est. speed input: 2215.77 toks/s, output: 780.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:25, 42.19it/s, est. speed input: 2264.09 toks/s, output: 804.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:38<00:17, 60.82it/s, est. speed input: 2432.88 toks/s, output: 877.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:16, 62.40it/s, est. speed input: 2593.25 toks/s, output: 949.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:15, 67.79it/s, est. speed input: 2703.29 toks/s, output: 994.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:20, 49.89it/s, est. speed input: 2798.27 toks/s, output: 1039.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:39<00:22, 44.84it/s, est. speed input: 2899.92 toks/s, output: 1087.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:39<00:16, 60.83it/s, est. speed input: 3065.89 toks/s, output: 1148.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:39<00:14, 67.48it/s, est. speed input: 3177.23 toks/s, output: 1201.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:40<00:16, 61.07it/s, est. speed input: 3279.47 toks/s, output: 1249.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:40<00:18, 52.92it/s, est. speed input: 3370.70 toks/s, output: 1303.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:40<00:13, 72.30it/s, est. speed input: 3580.83 toks/s, output: 1407.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:40<00:09, 96.57it/s, est. speed input: 3850.97 toks/s, output: 1522.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:41<00:13, 67.72it/s, est. speed input: 3999.52 toks/s, output: 1581.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:41<00:13, 67.74it/s, est. speed input: 4100.03 toks/s, output: 1624.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:41<00:18, 49.04it/s, est. speed input: 4170.34 toks/s, output: 1648.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:41<00:18, 49.69it/s, est. speed input: 4257.39 toks/s, output: 1698.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:42<00:20, 43.91it/s, est. speed input: 4376.42 toks/s, output: 1748.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:42<00:19, 44.48it/s, est. speed input: 4465.69 toks/s, output: 1801.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:42<00:11, 72.22it/s, est. speed input: 4775.18 toks/s, output: 1954.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:42<00:11, 71.95it/s, est. speed input: 4907.94 toks/s, output: 2027.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:43<00:13, 62.11it/s, est. speed input: 4984.84 toks/s, output: 2069.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:43<00:12, 65.74it/s, est. speed input: 5081.66 toks/s, output: 2113.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:43<00:09, 80.80it/s, est. speed input: 5271.07 toks/s, output: 2206.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:43<00:08, 92.63it/s, est. speed input: 5463.80 toks/s, output: 2318.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:43<00:06, 118.63it/s, est. speed input: 5704.60 toks/s, output: 2457.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:43<00:06, 103.07it/s, est. speed input: 5872.24 toks/s, output: 2549.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:44<00:06, 113.17it/s, est. speed input: 6070.99 toks/s, output: 2674.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:44<00:06, 100.17it/s, est. speed input: 6197.14 toks/s, output: 2748.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:44<00:06, 106.53it/s, est. speed input: 6389.12 toks/s, output: 2858.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:44<00:07, 88.36it/s, est. speed input: 6510.47 toks/s, output: 2936.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:44<00:08, 78.76it/s, est. speed input: 6627.16 toks/s, output: 3006.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:45<00:07, 79.48it/s, est. speed input: 6714.71 toks/s, output: 3060.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:45<00:07, 76.89it/s, est. speed input: 6843.49 toks/s, output: 3147.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:45<00:07, 83.23it/s, est. speed input: 6979.22 toks/s, output: 3241.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:45<00:05, 112.04it/s, est. speed input: 7218.36 toks/s, output: 3383.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:45<00:05, 110.41it/s, est. speed input: 7346.28 toks/s, output: 3459.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:45<00:06, 83.85it/s, est. speed input: 7454.63 toks/s, output: 3528.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:46<00:06, 84.06it/s, est. speed input: 7536.46 toks/s, output: 3575.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:46<00:06, 81.56it/s, est. speed input: 7621.41 toks/s, output: 3633.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:46<00:07, 68.30it/s, est. speed input: 7685.62 toks/s, output: 3682.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:46<00:06, 80.25it/s, est. speed input: 7854.71 toks/s, output: 3796.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:46<00:05, 88.31it/s, est. speed input: 8025.53 toks/s, output: 3897.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:46<00:03, 118.28it/s, est. speed input: 8255.83 toks/s, output: 4046.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:47<00:05, 83.36it/s, est. speed input: 8343.21 toks/s, output: 4128.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:47<00:03, 101.95it/s, est. speed input: 8567.92 toks/s, output: 4320.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:47<00:03, 108.30it/s, est. speed input: 8686.20 toks/s, output: 4385.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:47<00:02, 129.52it/s, est. speed input: 8953.65 toks/s, output: 4566.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:47<00:02, 119.48it/s, est. speed input: 9066.15 toks/s, output: 4648.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:48<00:03, 104.44it/s, est. speed input: 9189.49 toks/s, output: 4760.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:48<00:03, 104.53it/s, est. speed input: 9307.00 toks/s, output: 4830.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:48<00:02, 131.27it/s, est. speed input: 9529.14 toks/s, output: 4993.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:48<00:02, 128.90it/s, est. speed input: 9648.89 toks/s, output: 5079.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:48<00:01, 138.48it/s, est. speed input: 9815.43 toks/s, output: 5187.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:48<00:01, 120.00it/s, est. speed input: 9918.44 toks/s, output: 5270.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:48<00:02, 93.70it/s, est. speed input: 10011.36 toks/s, output: 5349.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:49<00:02, 86.34it/s, est. speed input: 10108.68 toks/s, output: 5457.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:49<00:02, 96.77it/s, est. speed input: 10225.86 toks/s, output: 5579.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:49<00:01, 108.60it/s, est. speed input: 10380.66 toks/s, output: 5709.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:49<00:01, 135.77it/s, est. speed input: 10661.25 toks/s, output: 5955.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:49<00:00, 135.60it/s, est. speed input: 10775.24 toks/s, output: 6044.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:49<00:01, 103.94it/s, est. speed input: 10868.75 toks/s, output: 6135.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:50<00:01, 85.38it/s, est. speed input: 10946.51 toks/s, output: 6256.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:50<00:01, 62.79it/s, est. speed input: 10961.26 toks/s, output: 6307.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:50<00:01, 66.05it/s, est. speed input: 11059.73 toks/s, output: 6441.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:51<00:01, 55.26it/s, est. speed input: 11094.07 toks/s, output: 6484.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:51<00:01, 53.90it/s, est. speed input: 11140.67 toks/s, output: 6556.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:51<00:01, 36.70it/s, est. speed input: 11117.85 toks/s, output: 6577.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:52<00:01, 32.46it/s, est. speed input: 11104.74 toks/s, output: 6599.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:52<00:01, 34.31it/s, est. speed input: 11124.55 toks/s, output: 6628.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:01, 21.45it/s, est. speed input: 11057.22 toks/s, output: 6615.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:52<00:01, 20.84it/s, est. speed input: 11047.30 toks/s, output: 6651.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:01, 10.83it/s, est. speed input: 10851.60 toks/s, output: 6555.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:01, 13.43it/s, est. speed input: 10869.80 toks/s, output: 6587.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 16.30it/s, est. speed input: 10886.99 toks/s, output: 6619.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 11.29it/s, est. speed input: 10774.65 toks/s, output: 6582.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:04<00:00, 1.67it/s, est. speed input: 9232.43 toks/s, output: 5679.65 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:04<00:00, 19.82it/s, est. speed input: 9232.43 toks/s, output: 5679.65 toks/s]
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.14
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.294
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.278
[36m(Runner pid=3309020)[0m gen: 102.023
[36m(Runner pid=3309020)[0m old: 85.576
[36m(Runner pid=3309020)[0m ref: 86.185
[36m(Runner pid=3309020)[0m reward: 6.356
[36m(Runner pid=3309020)[0m save_checkpoint: 30.584
[36m(Runner pid=3309020)[0m step: 1040.607
[36m(Runner pid=3309020)[0m update_actor: 562.361
[36m(Runner pid=3309020)[0m validation: 166.575
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.314
[36m(Runner pid=3309020)[0m format_reward: 0.991
[36m(Runner pid=3309020)[0m overall_reward: 0.653
[36m(Runner pid=3309020)[0m reward_score: 0.653
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.993
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 36; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:13:43 [executor_base.py:219] It took 0.344079 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:15:18 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:13:43 [executor_base.py:219] It took 0.345343 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:15:18 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:15:18 [executor_base.py:208] It took 0.326723 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:15:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:15:35 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:15:35 [executor_base.py:208] It took 0.326172 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00013869738904759288, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004293749516364187}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.44339776039123535, 'actor/pg_clipfrac': 0.0013175230706110597, 'actor/ppo_kl': -0.0025383669417351484}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.22486047446727753, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0001549907901789993, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00018723518587648869, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.4376781665487215e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.09223049879074097, 'actor/pg_clipfrac': 0.0005431830650195479, 'actor/ppo_kl': 0.0006437814445234835}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.10917899012565613, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00025080880732275546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.000287851580651477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00017834622121881694, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3551289141178131, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.005777654703706503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012345339637249708}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.18117105960845947, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2370728701353073, 'actor/pg_clipfrac': 0.0015797788510099053, 'actor/ppo_kl': 0.0013951878063380718}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.20061957836151123, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00142749457154423}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4107306897640228, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.08000647276639938, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00038588183815591037}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00026818091282621026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014801564393565059}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00016837492876220495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001918169786222279}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4763318598270416, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016680177068337798}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3038838505744934, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014329535188153386}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.4116857051849365, 'actor/pg_clipfrac': 0.0018034265376627445, 'actor/ppo_kl': -0.0009907721541821957}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.21625299751758575, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009343783021904528}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4062153697013855, 'actor/pg_clipfrac': 0.0008467400330118835, 'actor/ppo_kl': 3.5239920634921873e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 1.0402772426605225, 'actor/pg_clipfrac': 0.0002821670495904982, 'actor/ppo_kl': -4.628495662473142e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.11777059733867645, 'actor/pg_clipfrac': 0.0008285004296340048, 'actor/ppo_kl': 0.0020160390995442867}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5278281569480896, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014174416428431869}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2596403956413269, 'actor/pg_clipfrac': 0.00206611561588943, 'actor/ppo_kl': -0.0021630357950925827}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1772201657295227, 'actor/pg_clipfrac': 0.0013486177194863558, 'actor/ppo_kl': 0.0007793030235916376}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.030559703707695007, 'actor/pg_clipfrac': 0.0009741841349750757, 'actor/ppo_kl': -0.0012612767750397325}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.4468320906162262, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000678975717164576}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.196271151304245, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004333632532507181}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001925567485159263, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001557067735120654}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3978947103023529, 'actor/pg_clipfrac': 0.0007855459698475897, 'actor/ppo_kl': 0.0021205581724643707}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.18918181955814362, 'actor/pg_clipfrac': 0.0009372071363031864, 'actor/ppo_kl': -0.0008563459268771112}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00022638399968855083, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007828799425624311}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.07550445944070816, 'actor/pg_clipfrac': 0.0018814675277099013, 'actor/ppo_kl': 0.0001376592554152012}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3421483039855957, 'actor/pg_clipfrac': 0.0015267175622284412, 'actor/ppo_kl': -0.0006348864990286529}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.16467304527759552, 'actor/pg_clipfrac': 0.000584112131036818, 'actor/ppo_kl': -0.000180467264726758}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.019346699118614197, 'actor/pg_clipfrac': 0.0009765625, 'actor/ppo_kl': -0.0014967825263738632}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1700173318386078, 'actor/pg_clipfrac': 0.0012722646351903677, 'actor/ppo_kl': -0.00029041929519735277}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.11611463129520416, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014641440939158201}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.23399993777275085, 'actor/pg_clipfrac': 0.0013850415125489235, 'actor/ppo_kl': -0.0011962298303842545}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.05259108170866966, 'actor/pg_clipfrac': 0.0013351135421544313, 'actor/ppo_kl': -0.0001269914791919291}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00024165261129382998, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00047597073717042804}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.08418460935354233, 'actor/pg_clipfrac': 0.0008467400330118835, 'actor/ppo_kl': -0.00017898068472277373}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00027727874112315476, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00025138998171314597}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08899571746587753, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009903694735839963}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002518057881388813, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008450732566416264}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.1487719565629959, 'actor/pg_clipfrac': 0.0020120723638683558, 'actor/ppo_kl': 0.0008484957506880164}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.16400086879730225, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001846706378273666}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.29206761717796326, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016219554236158729}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.42815181612968445, 'actor/pg_clipfrac': 0.002298850566148758, 'actor/ppo_kl': 0.0026557703968137503}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00020694754493888468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011417422210797668}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.042087458074092865, 'actor/pg_clipfrac': 0.0024271844886243343, 'actor/ppo_kl': -0.0009109117672778666}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.20936857163906097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.003391720587387681}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1970183104276657, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005080958362668753}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0003543803468346596, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018228677799925208}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3513937294483185, 'actor/pg_clipfrac': 0.0009578543831594288, 'actor/ppo_kl': 0.0014483444392681122}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2578432559967041, 'actor/pg_clipfrac': 0.0025445292703807354, 'actor/ppo_kl': 0.0008954638033173978}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.19259236752986908, 'actor/pg_clipfrac': 0.00206611561588943, 'actor/ppo_kl': -0.0016996191116049886}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.11483649909496307, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002593209268525243}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.5966981649398804, 'actor/pg_clipfrac': 0.0009699321235530078, 'actor/ppo_kl': -0.0013486490352079272}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4623807966709137, 'actor/pg_clipfrac': 0.003481894265860319, 'actor/ppo_kl': 0.0002926000452134758}
[36m(Runner pid=3309020)[0m Step 36
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.26
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.034
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: -3.5877139378825974e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:14<1:00:51, 2.86s/it, est. speed input: 155.40 toks/s, output: 21.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:22<44:50, 2.12s/it, est. speed input: 206.02 toks/s, output: 37.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:27<34:26, 1.63s/it, est. speed input: 250.70 toks/s, output: 54.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<23:22, 1.11s/it, est. speed input: 308.48 toks/s, output: 76.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:09, 1.29it/s, est. speed input: 376.32 toks/s, output: 98.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<11:28, 1.81it/s, est. speed input: 447.58 toks/s, output: 119.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<07:59, 2.60it/s, est. speed input: 514.16 toks/s, output: 140.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:13, 3.32it/s, est. speed input: 574.14 toks/s, output: 161.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<04:32, 4.53it/s, est. speed input: 643.00 toks/s, output: 183.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<05:37, 3.65it/s, est. speed input: 670.63 toks/s, output: 190.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<05:15, 3.88it/s, est. speed input: 716.20 toks/s, output: 208.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<04:09, 4.90it/s, est. speed input: 771.45 toks/s, output: 225.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:56, 6.87it/s, est. speed input: 881.27 toks/s, output: 262.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:30, 8.03it/s, est. speed input: 939.75 toks/s, output: 283.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<02:04, 9.63it/s, est. speed input: 998.75 toks/s, output: 304.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:52, 10.55it/s, est. speed input: 1095.85 toks/s, output: 341.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<01:22, 14.24it/s, est. speed input: 1257.97 toks/s, output: 408.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<01:28, 13.24it/s, est. speed input: 1303.61 toks/s, output: 427.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:39<01:24, 13.73it/s, est. speed input: 1395.33 toks/s, output: 472.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<01:13, 15.70it/s, est. speed input: 1450.61 toks/s, output: 492.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:39<00:57, 19.92it/s, est. speed input: 1556.35 toks/s, output: 535.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:39<00:57, 19.98it/s, est. speed input: 1606.79 toks/s, output: 557.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:40<00:40, 27.78it/s, est. speed input: 1777.28 toks/s, output: 621.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:40<00:35, 31.07it/s, est. speed input: 1873.47 toks/s, output: 666.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:36, 30.22it/s, est. speed input: 2011.62 toks/s, output: 729.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:41<00:38, 28.55it/s, est. speed input: 2051.59 toks/s, output: 755.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:27, 39.40it/s, est. speed input: 2205.58 toks/s, output: 828.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:41<00:28, 37.54it/s, est. speed input: 2252.48 toks/s, output: 852.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:41<00:31, 33.93it/s, est. speed input: 2296.86 toks/s, output: 875.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:26, 40.64it/s, est. speed input: 2397.12 toks/s, output: 922.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:20, 50.13it/s, est. speed input: 2554.19 toks/s, output: 991.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:16, 61.36it/s, est. speed input: 2712.78 toks/s, output: 1081.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:42<00:17, 57.47it/s, est. speed input: 2806.30 toks/s, output: 1126.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:42<00:15, 64.83it/s, est. speed input: 2907.22 toks/s, output: 1168.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:42<00:18, 55.07it/s, est. speed input: 2998.68 toks/s, output: 1216.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:42<00:17, 57.88it/s, est. speed input: 3091.27 toks/s, output: 1257.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:43<00:19, 51.30it/s, est. speed input: 3182.46 toks/s, output: 1302.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:43<00:13, 71.60it/s, est. speed input: 3380.87 toks/s, output: 1389.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:43<00:16, 56.63it/s, est. speed input: 3462.71 toks/s, output: 1424.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:20, 46.60it/s, est. speed input: 3543.88 toks/s, output: 1464.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:43<00:17, 52.13it/s, est. speed input: 3641.93 toks/s, output: 1522.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:44<00:17, 53.55it/s, est. speed input: 3740.40 toks/s, output: 1570.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:44<00:15, 58.89it/s, est. speed input: 3876.04 toks/s, output: 1629.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:44<00:11, 74.53it/s, est. speed input: 4070.65 toks/s, output: 1737.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:10, 79.72it/s, est. speed input: 4309.71 toks/s, output: 1868.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:11, 71.50it/s, est. speed input: 4396.81 toks/s, output: 1908.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:45<00:11, 73.91it/s, est. speed input: 4492.22 toks/s, output: 1958.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:45<00:11, 70.88it/s, est. speed input: 4579.80 toks/s, output: 2010.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:09, 83.57it/s, est. speed input: 4718.48 toks/s, output: 2099.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:45<00:09, 84.67it/s, est. speed input: 4812.38 toks/s, output: 2156.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:11, 70.67it/s, est. speed input: 4892.17 toks/s, output: 2212.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:10, 76.29it/s, est. speed input: 4986.45 toks/s, output: 2274.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:45<00:08, 86.50it/s, est. speed input: 5128.00 toks/s, output: 2363.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:46<00:08, 92.11it/s, est. speed input: 5318.16 toks/s, output: 2458.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:46<00:08, 88.27it/s, est. speed input: 5404.03 toks/s, output: 2508.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:09, 80.52it/s, est. speed input: 5490.16 toks/s, output: 2560.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:06, 114.77it/s, est. speed input: 5770.57 toks/s, output: 2742.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:09, 74.73it/s, est. speed input: 5867.18 toks/s, output: 2800.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:47<00:08, 80.86it/s, est. speed input: 5999.55 toks/s, output: 2887.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:47<00:06, 98.92it/s, est. speed input: 6179.17 toks/s, output: 2986.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:47<00:05, 106.07it/s, est. speed input: 6311.68 toks/s, output: 3065.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:05, 112.26it/s, est. speed input: 6444.03 toks/s, output: 3126.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:47<00:05, 108.98it/s, est. speed input: 6575.83 toks/s, output: 3218.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:05, 106.55it/s, est. speed input: 6707.12 toks/s, output: 3309.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:47<00:04, 132.77it/s, est. speed input: 6931.91 toks/s, output: 3474.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:48<00:04, 135.56it/s, est. speed input: 7105.10 toks/s, output: 3594.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:48<00:03, 134.13it/s, est. speed input: 7224.71 toks/s, output: 3687.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:48<00:04, 115.93it/s, est. speed input: 7389.56 toks/s, output: 3799.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:48<00:04, 99.09it/s, est. speed input: 7500.50 toks/s, output: 3874.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:48<00:05, 85.00it/s, est. speed input: 7607.41 toks/s, output: 3973.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:48<00:05, 87.48it/s, est. speed input: 7727.30 toks/s, output: 4051.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:49<00:07, 62.18it/s, est. speed input: 7768.92 toks/s, output: 4090.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:49<00:04, 91.07it/s, est. speed input: 8029.26 toks/s, output: 4278.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:03, 111.41it/s, est. speed input: 8247.92 toks/s, output: 4428.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 121.22it/s, est. speed input: 8416.78 toks/s, output: 4543.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 105.03it/s, est. speed input: 8524.33 toks/s, output: 4610.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:50<00:02, 121.02it/s, est. speed input: 8698.71 toks/s, output: 4741.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:50<00:02, 137.26it/s, est. speed input: 8869.44 toks/s, output: 4867.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:02, 104.66it/s, est. speed input: 9014.02 toks/s, output: 4979.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:50<00:02, 111.22it/s, est. speed input: 9127.65 toks/s, output: 5073.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:50<00:02, 113.97it/s, est. speed input: 9231.43 toks/s, output: 5167.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:50<00:02, 105.11it/s, est. speed input: 9343.42 toks/s, output: 5286.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:51<00:02, 107.49it/s, est. speed input: 9498.35 toks/s, output: 5404.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:51<00:02, 104.22it/s, est. speed input: 9607.29 toks/s, output: 5504.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:51<00:02, 80.76it/s, est. speed input: 9691.72 toks/s, output: 5571.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:51<00:01, 117.73it/s, est. speed input: 9955.38 toks/s, output: 5787.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:51<00:01, 107.89it/s, est. speed input: 10061.94 toks/s, output: 5914.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 99.58it/s, est. speed input: 10165.62 toks/s, output: 6019.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:52<00:01, 110.61it/s, est. speed input: 10323.41 toks/s, output: 6147.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:52<00:00, 117.32it/s, est. speed input: 10435.61 toks/s, output: 6243.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:01, 71.37it/s, est. speed input: 10492.43 toks/s, output: 6340.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:52<00:01, 80.09it/s, est. speed input: 10595.71 toks/s, output: 6456.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 61.34it/s, est. speed input: 10644.56 toks/s, output: 6535.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:00, 61.94it/s, est. speed input: 10699.54 toks/s, output: 6614.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 60.44it/s, est. speed input: 10747.96 toks/s, output: 6674.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:53<00:00, 57.33it/s, est. speed input: 10791.88 toks/s, output: 6716.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 55.85it/s, est. speed input: 10870.33 toks/s, output: 6821.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 39.10it/s, est. speed input: 10869.71 toks/s, output: 6855.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 39.73it/s, est. speed input: 10886.77 toks/s, output: 6880.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 17.87it/s, est. speed input: 10718.14 toks/s, output: 6803.67 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 22.93it/s, est. speed input: 10718.14 toks/s, output: 6803.67 toks/s]
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.644
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.644
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 970528
[36m(Runner pid=3309020)[0m balanced_min: 970031
[36m(Runner pid=3309020)[0m max: 977755
[36m(Runner pid=3309020)[0m mean: 970279.5
[36m(Runner pid=3309020)[0m min: 962804
[36m(Runner pid=3309020)[0m minmax_diff: 14951
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.047
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1108.532
[36m(Runner pid=3309020)[0m time_per_step: 875.283
[36m(Runner pid=3309020)[0m total_num_tokens: 1940559
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 667.0
[36m(Runner pid=3309020)[0m mean: 464.299
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2581.0
[36m(Runner pid=3309020)[0m mean: 293.732
[36m(Runner pid=3309020)[0m min: 49.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.291
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.644
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.431431300737994e-05
[36m(Runner pid=3309020)[0m gen: 0.169
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.291
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.164
[36m(Runner pid=3309020)[0m gen: 127.352
[36m(Runner pid=3309020)[0m old: 88.006
[36m(Runner pid=3309020)[0m ref: 87.551
[36m(Runner pid=3309020)[0m reward: 7.007
[36m(Runner pid=3309020)[0m step: 875.283
[36m(Runner pid=3309020)[0m update_actor: 564.469
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 37; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:28:19 [executor_base.py:219] It took 0.339412 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:28:19 [executor_base.py:219] It took 0.339710 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:29:46 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:29:46 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:29:46 [executor_base.py:208] It took 0.325548 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.85 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:29:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:29:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:29:56 [executor_base.py:208] It took 0.329808 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.4475821852684021, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002249469398520887}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5147634744644165, 'actor/pg_clipfrac': 0.0020920501556247473, 'actor/ppo_kl': -0.00014095759252086282}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.10603539645671844, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.21712753176689148, 'actor/pg_clipfrac': 0.001861042226664722, 'actor/ppo_kl': -0.0009476399281993508}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3412545919418335, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3613321781158447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002813587198033929}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00033431462361477315, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.2728697955608368, 'actor/pg_clipfrac': 0.0010976948542520404, 'actor/ppo_kl': -0.0003236965276300907}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.4474382698535919, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.08743735402822495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.1662496030330658, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007217231905087829}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.16898499429225922, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.08569765090942383, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2776443362236023, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003054791013710201}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.24134895205497742, 'actor/pg_clipfrac': 0.0026490066666156054, 'actor/ppo_kl': 0.00023593270452693105}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.17525286972522736, 'actor/pg_clipfrac': 0.0009066183120012283, 'actor/ppo_kl': 0.00059614417841658}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00027440398116596043, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004956829361617565}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0001717451523290947, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001118868007324636}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.5436524748802185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016869709361344576}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.49939221143722534, 'actor/pg_clipfrac': 0.001172332908026874, 'actor/ppo_kl': 0.0006693027098663151}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.000225417606998235, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003816696407739073}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.4385450780391693, 'actor/pg_clipfrac': 0.0021413275972008705, 'actor/ppo_kl': -0.00016330089420080185}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.04513761028647423, 'actor/pg_clipfrac': 0.0007215007208287716, 'actor/ppo_kl': -0.0007987180724740028}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1668197065591812, 'actor/pg_clipfrac': 0.0017873101169243455, 'actor/ppo_kl': -0.002523902803659439}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00030485796742141247, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008910396718420088}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00017513881903141737, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00045191572280600667}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00023854344908613712, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006095760618336499}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00021587575611192733, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010617610532790422}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00021575823484454304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00035157319507561624}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.11551808565855026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001656946144066751}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3182527422904968, 'actor/pg_clipfrac': 0.002109704539179802, 'actor/ppo_kl': 0.0006102147744968534}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.09928504377603531, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018992333207279444}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00027094408869743347, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016162096289917827}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3410053253173828, 'actor/pg_clipfrac': 0.005132591817528009, 'actor/ppo_kl': -0.00036744281533174217}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.7064898014068604, 'actor/pg_clipfrac': 0.0005491488263942301, 'actor/ppo_kl': 0.000987639999948442}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.6049483418464661, 'actor/pg_clipfrac': 0.0007874015718698502, 'actor/ppo_kl': 0.00011876624193973839}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.22360236942768097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008745542727410793}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3192538022994995, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000624665233772248}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00025829856167547405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00035109862801618874}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.24765758216381073, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005561428260989487}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001638250978430733, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00045526158646680415}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1611194610595703, 'actor/pg_clipfrac': 0.003225806402042508, 'actor/ppo_kl': 0.00039510009810328484}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.03381311893463135, 'actor/pg_clipfrac': 0.004725898150354624, 'actor/ppo_kl': 0.0006455708062276244}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.23217624425888062, 'actor/pg_clipfrac': 0.0018281536176800728, 'actor/ppo_kl': -0.0003349663456901908}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00015196131425909698, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00026689714286476374}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00015738925139885396, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00036454707151278853}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.20879893004894257, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004229158512316644}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.10739782452583313, 'actor/pg_clipfrac': 0.0014367816038429737, 'actor/ppo_kl': -0.00045320906792767346}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00028119838680140674, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0026477687060832977}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5967218279838562, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00034844965557567775}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5708070993423462, 'actor/pg_clipfrac': 0.0017467249417677522, 'actor/ppo_kl': -0.0017391821602359414}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.36925840377807617, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001652281265705824}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.14648689329624176, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002723130746744573}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.29727739095687866, 'actor/pg_clipfrac': 0.0037499999161809683, 'actor/ppo_kl': 4.479884955799207e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.022534575313329697, 'actor/pg_clipfrac': 0.002344665816053748, 'actor/ppo_kl': -0.0005275730509310961}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.2571711540222168, 'actor/pg_clipfrac': 0.0008620689623057842, 'actor/ppo_kl': 0.0003175899910274893}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.05211833491921425, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010774167021736503}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.9656208753585815, 'actor/pg_clipfrac': 0.0015197568573057652, 'actor/ppo_kl': 0.002111953916028142}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.12254927307367325, 'actor/pg_clipfrac': 0.0021367522422224283, 'actor/ppo_kl': -0.0003214412136003375}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.000204061419935897, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003673829196486622}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.03029015101492405, 'actor/pg_clipfrac': 0.0005249343812465668, 'actor/ppo_kl': 0.0005070053157396615}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:14<1:00:10, 2.83s/it, est. speed input: 151.85 toks/s, output: 19.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<50:12, 2.37s/it, est. speed input: 182.52 toks/s, output: 35.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<29:38, 1.41s/it, est. speed input: 259.62 toks/s, output: 54.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<23:31, 1.12s/it, est. speed input: 306.72 toks/s, output: 69.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:30, 1.27it/s, est. speed input: 368.71 toks/s, output: 92.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<12:11, 1.71it/s, est. speed input: 432.12 toks/s, output: 109.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<09:47, 2.12it/s, est. speed input: 491.20 toks/s, output: 124.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<07:19, 2.82it/s, est. speed input: 554.56 toks/s, output: 144.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<06:13, 3.30it/s, est. speed input: 605.09 toks/s, output: 153.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<04:48, 4.27it/s, est. speed input: 664.66 toks/s, output: 175.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<04:50, 4.22it/s, est. speed input: 708.70 toks/s, output: 189.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:43, 5.45it/s, est. speed input: 765.19 toks/s, output: 215.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:28, 8.15it/s, est. speed input: 879.34 toks/s, output: 252.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:22, 8.46it/s, est. speed input: 934.58 toks/s, output: 267.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<02:12, 9.08it/s, est. speed input: 986.95 toks/s, output: 290.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:07, 17.58it/s, est. speed input: 1173.12 toks/s, output: 361.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:45, 25.96it/s, est. speed input: 1351.29 toks/s, output: 433.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:31, 36.41it/s, est. speed input: 1534.47 toks/s, output: 499.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:31, 36.16it/s, est. speed input: 1644.14 toks/s, output: 540.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:27, 41.83it/s, est. speed input: 1756.33 toks/s, output: 587.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:38, 29.01it/s, est. speed input: 1845.39 toks/s, output: 626.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:49, 22.52it/s, est. speed input: 1878.76 toks/s, output: 638.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:40, 27.43it/s, est. speed input: 2034.00 toks/s, output: 702.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:41, 26.35it/s, est. speed input: 2128.70 toks/s, output: 741.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:40<00:38, 28.18it/s, est. speed input: 2177.68 toks/s, output: 761.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:40, 26.61it/s, est. speed input: 2224.39 toks/s, output: 782.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:40<00:42, 25.41it/s, est. speed input: 2267.25 toks/s, output: 798.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:38, 28.20it/s, est. speed input: 2318.95 toks/s, output: 823.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:34, 31.07it/s, est. speed input: 2368.27 toks/s, output: 843.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:31, 33.84it/s, est. speed input: 2415.31 toks/s, output: 872.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:48, 22.07it/s, est. speed input: 2449.74 toks/s, output: 889.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:29, 35.38it/s, est. speed input: 2597.92 toks/s, output: 961.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:24, 41.89it/s, est. speed input: 2705.42 toks/s, output: 1011.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:42<00:27, 37.64it/s, est. speed input: 2842.54 toks/s, output: 1089.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:42<00:19, 52.16it/s, est. speed input: 3043.12 toks/s, output: 1190.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:42<00:19, 52.05it/s, est. speed input: 3139.18 toks/s, output: 1237.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:13, 69.80it/s, est. speed input: 3341.39 toks/s, output: 1326.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:13, 72.81it/s, est. speed input: 3492.25 toks/s, output: 1413.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:11, 82.00it/s, est. speed input: 3669.72 toks/s, output: 1476.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:43<00:10, 89.82it/s, est. speed input: 3822.17 toks/s, output: 1557.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:07, 116.30it/s, est. speed input: 4069.02 toks/s, output: 1695.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:43<00:10, 87.90it/s, est. speed input: 4202.67 toks/s, output: 1761.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:09, 89.63it/s, est. speed input: 4350.52 toks/s, output: 1839.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:44<00:12, 68.37it/s, est. speed input: 4468.96 toks/s, output: 1901.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:44<00:13, 61.10it/s, est. speed input: 4558.11 toks/s, output: 1958.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:10, 81.58it/s, est. speed input: 4764.32 toks/s, output: 2068.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:44<00:06, 130.12it/s, est. speed input: 5167.80 toks/s, output: 2298.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:44<00:07, 103.62it/s, est. speed input: 5333.48 toks/s, output: 2393.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:44<00:07, 105.96it/s, est. speed input: 5466.18 toks/s, output: 2477.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:08, 88.70it/s, est. speed input: 5590.86 toks/s, output: 2545.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:45<00:07, 93.92it/s, est. speed input: 5769.28 toks/s, output: 2635.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:45<00:08, 79.53it/s, est. speed input: 5887.10 toks/s, output: 2688.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:45<00:07, 91.66it/s, est. speed input: 6077.10 toks/s, output: 2808.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:45<00:06, 97.98it/s, est. speed input: 6268.06 toks/s, output: 2900.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:07, 91.15it/s, est. speed input: 6384.48 toks/s, output: 2984.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:46<00:05, 121.15it/s, est. speed input: 6669.86 toks/s, output: 3155.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:46<00:03, 158.81it/s, est. speed input: 6995.66 toks/s, output: 3368.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:46<00:03, 153.29it/s, est. speed input: 7165.02 toks/s, output: 3457.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:46<00:04, 123.16it/s, est. speed input: 7335.56 toks/s, output: 3551.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:46<00:04, 116.34it/s, est. speed input: 7508.65 toks/s, output: 3655.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:47<00:03, 128.33it/s, est. speed input: 7778.59 toks/s, output: 3824.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:47<00:03, 130.67it/s, est. speed input: 7951.33 toks/s, output: 3945.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:47<00:02, 163.42it/s, est. speed input: 8231.85 toks/s, output: 4105.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:47<00:02, 164.84it/s, est. speed input: 8406.01 toks/s, output: 4236.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:47<00:02, 153.26it/s, est. speed input: 8580.23 toks/s, output: 4350.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:47<00:02, 179.85it/s, est. speed input: 8852.66 toks/s, output: 4563.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:48<00:03, 116.24it/s, est. speed input: 8980.25 toks/s, output: 4671.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:48<00:03, 102.82it/s, est. speed input: 9139.14 toks/s, output: 4778.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:48<00:03, 101.71it/s, est. speed input: 9256.92 toks/s, output: 4865.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:48<00:02, 117.12it/s, est. speed input: 9423.53 toks/s, output: 4980.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:48<00:02, 115.37it/s, est. speed input: 9538.27 toks/s, output: 5073.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:48<00:02, 120.87it/s, est. speed input: 9663.43 toks/s, output: 5168.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:49<00:02, 108.91it/s, est. speed input: 9772.59 toks/s, output: 5246.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:49<00:03, 71.56it/s, est. speed input: 9836.95 toks/s, output: 5319.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:49<00:02, 79.89it/s, est. speed input: 9947.99 toks/s, output: 5416.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:49<00:02, 78.39it/s, est. speed input: 10062.01 toks/s, output: 5496.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:50<00:02, 73.07it/s, est. speed input: 10150.17 toks/s, output: 5577.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:50<00:02, 78.14it/s, est. speed input: 10261.37 toks/s, output: 5671.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:01, 82.66it/s, est. speed input: 10371.58 toks/s, output: 5753.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:50<00:01, 89.84it/s, est. speed input: 10480.62 toks/s, output: 5861.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:50<00:01, 88.35it/s, est. speed input: 10543.40 toks/s, output: 5918.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:50<00:01, 89.17it/s, est. speed input: 10609.74 toks/s, output: 5974.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:50<00:01, 86.85it/s, est. speed input: 10674.52 toks/s, output: 6065.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:50<00:01, 87.72it/s, est. speed input: 10740.94 toks/s, output: 6165.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:51<00:01, 88.59it/s, est. speed input: 10805.01 toks/s, output: 6214.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:51<00:00, 97.78it/s, est. speed input: 10900.34 toks/s, output: 6306.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:51<00:00, 92.84it/s, est. speed input: 10977.23 toks/s, output: 6384.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:51<00:00, 64.29it/s, est. speed input: 11005.74 toks/s, output: 6407.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:00, 70.97it/s, est. speed input: 11076.37 toks/s, output: 6484.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 57.05it/s, est. speed input: 11121.30 toks/s, output: 6547.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:00, 51.92it/s, est. speed input: 11161.03 toks/s, output: 6614.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 44.88it/s, est. speed input: 11188.71 toks/s, output: 6681.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:53<00:00, 33.13it/s, est. speed input: 11175.68 toks/s, output: 6721.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 26.87it/s, est. speed input: 11157.85 toks/s, output: 6747.75 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 23.90it/s, est. speed input: 11157.85 toks/s, output: 6747.75 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.05636516958475113, 'actor/pg_clipfrac': 0.0008944543660618365, 'actor/ppo_kl': 0.002771328203380108}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.17834463715553284, 'actor/pg_clipfrac': 0.00438116118311882, 'actor/ppo_kl': -0.0025017540901899338}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11720864474773407, 'actor/pg_clipfrac': 0.0036900369450449944, 'actor/ppo_kl': 0.0033857724629342556}
[36m(Runner pid=3309020)[0m Step 37
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.254
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.011
[36m(Runner pid=3309020)[0m ppo_kl: 4.1725385727797716e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.642
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.642
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 969751
[36m(Runner pid=3309020)[0m balanced_min: 968950
[36m(Runner pid=3309020)[0m max: 977926
[36m(Runner pid=3309020)[0m mean: 969350.5
[36m(Runner pid=3309020)[0m min: 960775
[36m(Runner pid=3309020)[0m minmax_diff: 17151
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.012
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1126.036
[36m(Runner pid=3309020)[0m time_per_step: 860.853
[36m(Runner pid=3309020)[0m total_num_tokens: 1938701
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 634.0
[36m(Runner pid=3309020)[0m mean: 464.35
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2584.0
[36m(Runner pid=3309020)[0m mean: 292.955
[36m(Runner pid=3309020)[0m min: 52.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.286
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.642
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.15
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.047
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.291
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.301
[36m(Runner pid=3309020)[0m gen: 112.378
[36m(Runner pid=3309020)[0m old: 86.322
[36m(Runner pid=3309020)[0m ref: 90.949
[36m(Runner pid=3309020)[0m reward: 6.727
[36m(Runner pid=3309020)[0m step: 860.853
[36m(Runner pid=3309020)[0m update_actor: 563.532
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 38; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:42:40 [executor_base.py:219] It took 0.339961 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:44:06 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:42:40 [executor_base.py:219] It took 0.345736 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:44:06 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.87 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:44:06 [executor_base.py:208] It took 0.325248 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.87 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:44:18 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:44:19 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:44:19 [executor_base.py:208] It took 0.328922 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.27430257201194763, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001195570221170783}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00023066070571076125, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5424044728279114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.14967817068099976, 'actor/pg_clipfrac': 0.001550387591123581, 'actor/ppo_kl': -0.0005517530953511596}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00021075115364510566, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.003031593980267644}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.16583184897899628, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1776878386735916, 'actor/pg_clipfrac': 0.0010526315309107304, 'actor/ppo_kl': 0.0003013846871908754}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.534943699836731, 'actor/pg_clipfrac': 0.0011261261533945799, 'actor/ppo_kl': -0.0004244924639351666}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.918285071849823, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.23596329987049103, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5212883353233337, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1155923455953598, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.22692543268203735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.11532745510339737, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0030413768254220486}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.029205109924077988, 'actor/pg_clipfrac': 0.001970443408936262, 'actor/ppo_kl': 0.000811756297480315}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.09696054458618164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.25370410084724426, 'actor/pg_clipfrac': 0.0025542783550918102, 'actor/ppo_kl': 0.0016215459909290075}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3277977406978607, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001179858882096596}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.471628874540329, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.022006654238794e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.34689703583717346, 'actor/pg_clipfrac': 0.0009689922444522381, 'actor/ppo_kl': 0.000139423122163862}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.04493892937898636, 'actor/pg_clipfrac': 0.0006954103009775281, 'actor/ppo_kl': -3.2061496312962845e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.18114568293094635, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000398480478907004}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4482583701610565, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009595449082553387}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.6089320778846741, 'actor/pg_clipfrac': 0.0033333334140479565, 'actor/ppo_kl': -0.0010457788594067097}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.7620140910148621, 'actor/pg_clipfrac': 0.0026007802225649357, 'actor/ppo_kl': 2.3252787286764942e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002429406449664384, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008007955038920045}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00026299883029423654, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00013361194578465074}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.09750176221132278, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00024349162413273007}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.17484788596630096, 'actor/pg_clipfrac': 0.0011402508243918419, 'actor/ppo_kl': 0.00041171544580720365}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.006632843986153603, 'actor/pg_clipfrac': 0.0023894861806184053, 'actor/ppo_kl': 0.0015869266353547573}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00018402055138722062, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018165213987231255}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.257938951253891, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006482566241174936}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3857496678829193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013087926199659705}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.27424079179763794, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008955070516094565}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.15305747091770172, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009925008052960038}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00023071070609148592, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00034980630152858794}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.045679740607738495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011121559655293822}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.03696027398109436, 'actor/pg_clipfrac': 0.00040290088509209454, 'actor/ppo_kl': -0.0004010919074062258}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00023047545982990414, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00017002676031552255}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00023762976343277842, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00013929280976299196}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.24186095595359802, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': 0.000662019825540483}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3848014175891876, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011808970011770725}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.43725186586380005, 'actor/pg_clipfrac': 0.0006169031257741153, 'actor/ppo_kl': -3.4671149478526786e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2404032200574875, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009456194820813835}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5679612159729004, 'actor/pg_clipfrac': 0.0021482277661561966, 'actor/ppo_kl': 0.0004743418248835951}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00024164312344510108, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012912285747006536}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.19120116531848907, 'actor/pg_clipfrac': 0.0012468828354030848, 'actor/ppo_kl': 0.00010810766252689064}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4604765772819519, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00130627048201859}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00016142975073307753, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014015563065186143}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5492868423461914, 'actor/pg_clipfrac': 0.0011983223957940936, 'actor/ppo_kl': -0.00029956572689116}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4061923027038574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007130670128390193}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.18688811361789703, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007154354243539274}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.10405847430229187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013908499386161566}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.203328937292099, 'actor/pg_clipfrac': 0.0011428571306169033, 'actor/ppo_kl': 0.0018835602095350623}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.14437656104564667, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002744250523392111}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:15<1:07:48, 3.19s/it, est. speed input: 142.27 toks/s, output: 20.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<54:24, 2.57s/it, est. speed input: 172.89 toks/s, output: 35.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:32<41:08, 1.95s/it, est. speed input: 210.95 toks/s, output: 49.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:33<26:31, 1.26s/it, est. speed input: 274.64 toks/s, output: 69.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:33<17:13, 1.21it/s, est. speed input: 341.19 toks/s, output: 89.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:34<12:31, 1.66it/s, est. speed input: 396.11 toks/s, output: 104.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:35<08:55, 2.33it/s, est. speed input: 453.91 toks/s, output: 124.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:35<06:27, 3.20it/s, est. speed input: 509.12 toks/s, output: 144.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:36<05:06, 4.03it/s, est. speed input: 559.33 toks/s, output: 164.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<04:05, 5.02it/s, est. speed input: 611.89 toks/s, output: 184.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<03:06, 6.58it/s, est. speed input: 671.35 toks/s, output: 206.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:37<01:55, 10.53it/s, est. speed input: 789.43 toks/s, output: 256.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<01:18, 15.26it/s, est. speed input: 912.48 toks/s, output: 301.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<01:08, 17.45it/s, est. speed input: 971.35 toks/s, output: 322.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:20, 14.84it/s, est. speed input: 1019.51 toks/s, output: 342.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:38<01:08, 17.46it/s, est. speed input: 1078.91 toks/s, output: 367.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:38<00:45, 25.67it/s, est. speed input: 1199.63 toks/s, output: 418.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:34, 33.92it/s, est. speed input: 1311.68 toks/s, output: 467.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:41, 27.99it/s, est. speed input: 1417.07 toks/s, output: 513.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:38<00:39, 29.48it/s, est. speed input: 1469.67 toks/s, output: 531.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:39<00:35, 32.04it/s, est. speed input: 1576.45 toks/s, output: 569.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:39<00:34, 33.13it/s, est. speed input: 1631.39 toks/s, output: 590.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<00:45, 24.91it/s, est. speed input: 1673.04 toks/s, output: 612.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:39<00:44, 25.59it/s, est. speed input: 1722.76 toks/s, output: 629.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:40<00:42, 26.65it/s, est. speed input: 1823.08 toks/s, output: 677.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:40<00:31, 35.79it/s, est. speed input: 1932.69 toks/s, output: 724.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:26, 41.23it/s, est. speed input: 2031.41 toks/s, output: 762.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:20, 53.06it/s, est. speed input: 2215.98 toks/s, output: 836.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:17, 59.83it/s, est. speed input: 2319.41 toks/s, output: 879.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:20, 52.79it/s, est. speed input: 2475.12 toks/s, output: 941.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:20, 50.42it/s, est. speed input: 2570.79 toks/s, output: 988.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:13, 74.60it/s, est. speed input: 2838.00 toks/s, output: 1112.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:16, 61.67it/s, est. speed input: 2929.94 toks/s, output: 1160.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:14, 67.34it/s, est. speed input: 3034.65 toks/s, output: 1206.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:14, 66.67it/s, est. speed input: 3132.98 toks/s, output: 1250.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:42<00:13, 72.74it/s, est. speed input: 3235.38 toks/s, output: 1300.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:42<00:15, 64.20it/s, est. speed input: 3325.30 toks/s, output: 1357.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:42<00:13, 71.05it/s, est. speed input: 3424.51 toks/s, output: 1406.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:12, 77.15it/s, est. speed input: 3529.40 toks/s, output: 1452.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:14, 66.27it/s, est. speed input: 3620.89 toks/s, output: 1507.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:15, 60.63it/s, est. speed input: 3712.46 toks/s, output: 1554.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:12, 71.76it/s, est. speed input: 3863.72 toks/s, output: 1628.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:10, 82.25it/s, est. speed input: 4058.02 toks/s, output: 1716.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:13, 67.11it/s, est. speed input: 4142.54 toks/s, output: 1758.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:12, 67.94it/s, est. speed input: 4237.22 toks/s, output: 1806.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:15, 56.49it/s, est. speed input: 4316.01 toks/s, output: 1849.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:16, 53.42it/s, est. speed input: 4399.07 toks/s, output: 1896.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:44<00:11, 70.54it/s, est. speed input: 4586.24 toks/s, output: 2007.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:44<00:11, 72.39it/s, est. speed input: 4678.19 toks/s, output: 2061.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:44<00:16, 50.84it/s, est. speed input: 4744.29 toks/s, output: 2097.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:14, 57.35it/s, est. speed input: 4871.90 toks/s, output: 2181.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:14, 55.43it/s, est. speed input: 4954.52 toks/s, output: 2225.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:13, 57.10it/s, est. speed input: 5037.10 toks/s, output: 2278.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:45<00:11, 67.21it/s, est. speed input: 5173.09 toks/s, output: 2350.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:45<00:11, 63.80it/s, est. speed input: 5256.60 toks/s, output: 2407.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:45<00:12, 62.16it/s, est. speed input: 5344.27 toks/s, output: 2452.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:11, 64.89it/s, est. speed input: 5433.01 toks/s, output: 2508.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:11, 63.34it/s, est. speed input: 5509.93 toks/s, output: 2554.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:46<00:12, 59.28it/s, est. speed input: 5587.66 toks/s, output: 2597.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:46<00:12, 57.09it/s, est. speed input: 5662.29 toks/s, output: 2650.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:11, 62.22it/s, est. speed input: 5747.07 toks/s, output: 2696.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:10, 64.88it/s, est. speed input: 5864.19 toks/s, output: 2776.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:47<00:09, 68.76it/s, est. speed input: 5944.51 toks/s, output: 2817.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:09, 66.18it/s, est. speed input: 6059.86 toks/s, output: 2883.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:47<00:08, 79.30it/s, est. speed input: 6192.21 toks/s, output: 2952.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:47<00:07, 80.97it/s, est. speed input: 6280.44 toks/s, output: 3004.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:47<00:07, 82.51it/s, est. speed input: 6361.96 toks/s, output: 3052.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:47<00:08, 76.19it/s, est. speed input: 6437.31 toks/s, output: 3094.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:47<00:08, 73.64it/s, est. speed input: 6509.33 toks/s, output: 3143.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:48<00:06, 93.81it/s, est. speed input: 6723.85 toks/s, output: 3292.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:48<00:05, 101.52it/s, est. speed input: 6848.04 toks/s, output: 3396.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:48<00:05, 102.82it/s, est. speed input: 6967.58 toks/s, output: 3464.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:48<00:04, 120.14it/s, est. speed input: 7216.52 toks/s, output: 3629.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:48<00:03, 135.81it/s, est. speed input: 7384.68 toks/s, output: 3774.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:03, 127.35it/s, est. speed input: 7520.53 toks/s, output: 3865.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:48<00:04, 117.09it/s, est. speed input: 7633.82 toks/s, output: 3961.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:49<00:04, 110.37it/s, est. speed input: 7752.47 toks/s, output: 4061.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:49<00:02, 156.80it/s, est. speed input: 8063.03 toks/s, output: 4266.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:02, 136.77it/s, est. speed input: 8211.68 toks/s, output: 4383.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:02, 136.32it/s, est. speed input: 8369.93 toks/s, output: 4493.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 112.54it/s, est. speed input: 8472.11 toks/s, output: 4571.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:49<00:02, 120.30it/s, est. speed input: 8640.66 toks/s, output: 4719.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:50<00:02, 109.10it/s, est. speed input: 8781.96 toks/s, output: 4827.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:02, 123.81it/s, est. speed input: 8941.42 toks/s, output: 4964.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:50<00:02, 117.10it/s, est. speed input: 9055.07 toks/s, output: 5069.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:50<00:02, 131.82it/s, est. speed input: 9260.22 toks/s, output: 5222.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:50<00:01, 130.07it/s, est. speed input: 9372.25 toks/s, output: 5313.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:50<00:01, 133.67it/s, est. speed input: 9494.50 toks/s, output: 5406.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:50<00:01, 113.33it/s, est. speed input: 9602.21 toks/s, output: 5502.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:51<00:01, 119.24it/s, est. speed input: 9721.87 toks/s, output: 5622.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:51<00:01, 112.60it/s, est. speed input: 9839.32 toks/s, output: 5728.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:02, 86.94it/s, est. speed input: 9921.96 toks/s, output: 5817.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:51<00:01, 87.61it/s, est. speed input: 10028.38 toks/s, output: 5912.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:51<00:01, 99.05it/s, est. speed input: 10133.70 toks/s, output: 6010.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:51<00:01, 117.82it/s, est. speed input: 10289.95 toks/s, output: 6179.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:00, 128.36it/s, est. speed input: 10471.61 toks/s, output: 6353.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:52<00:00, 116.83it/s, est. speed input: 10575.42 toks/s, output: 6468.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:52<00:00, 103.27it/s, est. speed input: 10671.99 toks/s, output: 6553.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:52<00:00, 86.40it/s, est. speed input: 10754.88 toks/s, output: 6643.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:52<00:00, 81.32it/s, est. speed input: 10838.07 toks/s, output: 6752.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:53<00:00, 36.21it/s, est. speed input: 10761.77 toks/s, output: 6752.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 31.01it/s, est. speed input: 10755.34 toks/s, output: 6765.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 31.18it/s, est. speed input: 10778.77 toks/s, output: 6823.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 28.99it/s, est. speed input: 10770.12 toks/s, output: 6847.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:06<00:00, 28.99it/s, est. speed input: 10770.12 toks/s, output: 6847.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:11<00:00, 1.62it/s, est. speed input: 8297.88 toks/s, output: 5329.92 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:11<00:00, 17.94it/s, est. speed input: 8297.88 toks/s, output: 5329.92 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.6099183559417725, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010493755107745528}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.09763999283313751, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011473210761323571}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3420654237270355, 'actor/pg_clipfrac': 0.002209944650530815, 'actor/ppo_kl': -0.0024164358619600534}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.02825421839952469, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013771610101684928}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.4175158441066742, 'actor/pg_clipfrac': 0.0026881720405071974, 'actor/ppo_kl': -0.00010691984061850235}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1549791544675827, 'actor/pg_clipfrac': 0.005167958792299032, 'actor/ppo_kl': 0.00020008432329632342}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003992389829363674, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.98172201635316e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.21546034514904022, 'actor/pg_clipfrac': 0.001183431944809854, 'actor/ppo_kl': -0.000745179655496031}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.23257915675640106, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004098328936379403}
[36m(Runner pid=3309020)[0m Step 38
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.256
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.021
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 7.485639713920023e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.026
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.026
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.649
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.649
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 966810
[36m(Runner pid=3309020)[0m balanced_min: 966676
[36m(Runner pid=3309020)[0m max: 974525
[36m(Runner pid=3309020)[0m mean: 966743.0
[36m(Runner pid=3309020)[0m min: 958961
[36m(Runner pid=3309020)[0m minmax_diff: 15564
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 104.637
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1126.604
[36m(Runner pid=3309020)[0m time_per_step: 858.104
[36m(Runner pid=3309020)[0m total_num_tokens: 1933486
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.723
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2375.0
[36m(Runner pid=3309020)[0m mean: 288.545
[36m(Runner pid=3309020)[0m min: 52.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.3
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.649
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.113596532031235e-05
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.291
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.138
[36m(Runner pid=3309020)[0m gen: 112.489
[36m(Runner pid=3309020)[0m old: 87.557
[36m(Runner pid=3309020)[0m ref: 87.488
[36m(Runner pid=3309020)[0m reward: 6.756
[36m(Runner pid=3309020)[0m step: 858.104
[36m(Runner pid=3309020)[0m update_actor: 563.103
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 39; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:57:05 [executor_base.py:219] It took 0.339070 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:58:32 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:57:05 [executor_base.py:219] It took 0.340407 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:58:32 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 06:58:32 [executor_base.py:208] It took 0.327214 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:58:48 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:58:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 06:58:48 [executor_base.py:208] It took 0.326469 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00011657556024147198, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003061752940993756}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.008905846625566483, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.7142623662948608, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1822262406349182, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.28187504410743713, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3162018954753876, 'actor/pg_clipfrac': 0.0018382353009656072, 'actor/ppo_kl': -0.0015577884623780847}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2979913055896759, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.06359720975160599, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.2864912450313568, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0001459420018363744, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5859741568565369, 'actor/pg_clipfrac': 0.0006464124307967722, 'actor/ppo_kl': -0.0015483602182939649}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.10453768819570541, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.9308503866195679, 'actor/pg_clipfrac': 0.00022431583784054965, 'actor/ppo_kl': 0.0005187616334296763}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.16457955539226532, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.010403956286609173, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.7251067161560059, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001591619657119736, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.967220931779593e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00028503683279268444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003560399345587939}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.15689030289649963, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002927614841610193}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00023232850071508437, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002158572431653738}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00023079353559296578, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006516014109365642}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.1443796455860138, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009800789412111044}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.09046364575624466, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008898928645066917}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.14350681006908417, 'actor/pg_clipfrac': 0.001988071482628584, 'actor/ppo_kl': 0.00011455088679213077}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00020477038924582303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014228542568162084}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00030464481096714735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00014823366655036807}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.33453288674354553, 'actor/pg_clipfrac': 0.002739726100116968, 'actor/ppo_kl': 0.000909499591216445}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.09841199219226837, 'actor/pg_clipfrac': 0.0011229646624997258, 'actor/ppo_kl': -0.0006802112911827862}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.02450321987271309, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005710845580324531}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.4018106758594513, 'actor/pg_clipfrac': 0.001372683560475707, 'actor/ppo_kl': 0.0008912436896935105}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.18344508111476898, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013936922186985612}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.13649292290210724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000689071835950017}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.44804009795188904, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007908045081421733}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.13969603180885315, 'actor/pg_clipfrac': 0.000499251123983413, 'actor/ppo_kl': 0.0003601822827477008}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4097135365009308, 'actor/pg_clipfrac': 0.0030706243123859167, 'actor/ppo_kl': 0.00036897140671499074}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5346862077713013, 'actor/pg_clipfrac': 0.0034762455616146326, 'actor/ppo_kl': -0.0013106446713209152}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00019978126510977745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00024141898029483855}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00028164603281766176, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018452975200489163}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002037460362771526, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006906007183715701}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.23757193982601166, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002773926535155624}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1221860721707344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005636930000036955}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.05244124308228493, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000504866533447057}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00031329403282143176, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016130903968587518}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2533322870731354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000609666807577014}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0001728088391246274, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00021034128440078348}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.19944433867931366, 'actor/pg_clipfrac': 0.0009832842042669654, 'actor/ppo_kl': 0.00031375791877508163}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.08798428624868393, 'actor/pg_clipfrac': 0.0010672358330339193, 'actor/ppo_kl': -0.0006755454232916236}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.16595955193042755, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007312269299291074}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.35788261890411377, 'actor/pg_clipfrac': 0.0006738544325344265, 'actor/ppo_kl': -0.0003763003624044359}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.45367005467414856, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00043877639109268785}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:23<1:41:27, 4.77s/it, est. speed input: 97.80 toks/s, output: 21.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<48:36, 2.30s/it, est. speed input: 170.89 toks/s, output: 37.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<29:03, 1.38s/it, est. speed input: 242.45 toks/s, output: 55.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<18:08, 1.16it/s, est. speed input: 319.24 toks/s, output: 79.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<11:24, 1.83it/s, est. speed input: 443.64 toks/s, output: 113.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<08:29, 2.44it/s, est. speed input: 511.30 toks/s, output: 135.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<07:38, 2.70it/s, est. speed input: 557.87 toks/s, output: 151.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<05:36, 3.66it/s, est. speed input: 626.52 toks/s, output: 174.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<02:43, 7.48it/s, est. speed input: 826.75 toks/s, output: 239.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:33<01:52, 10.76it/s, est. speed input: 966.09 toks/s, output: 280.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:56, 10.22it/s, est. speed input: 1120.22 toks/s, output: 326.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:42, 11.57it/s, est. speed input: 1183.97 toks/s, output: 347.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:14, 15.82it/s, est. speed input: 1309.06 toks/s, output: 392.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<01:06, 17.56it/s, est. speed input: 1367.56 toks/s, output: 409.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:35<00:49, 23.69it/s, est. speed input: 1484.76 toks/s, output: 450.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:35<00:45, 25.28it/s, est. speed input: 1543.10 toks/s, output: 469.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:35<00:46, 24.84it/s, est. speed input: 1598.31 toks/s, output: 489.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:36<00:42, 26.76it/s, est. speed input: 1652.29 toks/s, output: 510.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:36<00:52, 21.84it/s, est. speed input: 1701.77 toks/s, output: 530.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:36<01:07, 16.80it/s, est. speed input: 1742.03 toks/s, output: 545.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:44, 25.21it/s, est. speed input: 1865.63 toks/s, output: 586.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:37<00:33, 33.61it/s, est. speed input: 1983.97 toks/s, output: 634.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:32, 34.19it/s, est. speed input: 2037.47 toks/s, output: 651.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:37<00:31, 34.80it/s, est. speed input: 2095.74 toks/s, output: 678.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:37<00:27, 39.63it/s, est. speed input: 2211.99 toks/s, output: 722.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:37<00:27, 38.94it/s, est. speed input: 2319.48 toks/s, output: 767.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:38<00:27, 38.60it/s, est. speed input: 2427.38 toks/s, output: 809.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:38<00:19, 53.65it/s, est. speed input: 2604.28 toks/s, output: 885.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:38<00:17, 59.07it/s, est. speed input: 2713.69 toks/s, output: 932.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:38<00:25, 41.45it/s, est. speed input: 2799.89 toks/s, output: 978.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:38<00:23, 44.97it/s, est. speed input: 2905.06 toks/s, output: 1021.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:39<00:23, 44.23it/s, est. speed input: 3002.17 toks/s, output: 1066.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:39<00:19, 51.77it/s, est. speed input: 3105.90 toks/s, output: 1126.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:39<00:20, 48.96it/s, est. speed input: 3203.84 toks/s, output: 1174.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:40<00:29, 34.13it/s, est. speed input: 3274.19 toks/s, output: 1206.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:40<00:26, 37.22it/s, est. speed input: 3371.84 toks/s, output: 1246.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:40<00:28, 33.75it/s, est. speed input: 3457.30 toks/s, output: 1291.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:40<00:23, 40.43it/s, est. speed input: 3603.50 toks/s, output: 1354.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:40<00:17, 54.64it/s, est. speed input: 3763.28 toks/s, output: 1420.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:41<00:17, 53.25it/s, est. speed input: 3854.57 toks/s, output: 1449.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:41<00:16, 56.19it/s, est. speed input: 3950.91 toks/s, output: 1501.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:41<00:15, 58.74it/s, est. speed input: 4051.48 toks/s, output: 1557.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:41<00:12, 69.71it/s, est. speed input: 4199.74 toks/s, output: 1623.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:41<00:15, 59.10it/s, est. speed input: 4277.19 toks/s, output: 1680.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:42<00:13, 64.28it/s, est. speed input: 4465.08 toks/s, output: 1762.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:42<00:13, 62.57it/s, est. speed input: 4559.79 toks/s, output: 1808.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:42<00:13, 61.47it/s, est. speed input: 4654.01 toks/s, output: 1850.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:42<00:10, 76.73it/s, est. speed input: 4861.12 toks/s, output: 1973.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:42<00:12, 63.46it/s, est. speed input: 4940.21 toks/s, output: 2023.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:43<00:08, 91.99it/s, est. speed input: 5192.95 toks/s, output: 2112.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:43<00:12, 62.44it/s, est. speed input: 5300.34 toks/s, output: 2177.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:43<00:11, 65.90it/s, est. speed input: 5398.12 toks/s, output: 2238.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:43<00:11, 64.95it/s, est. speed input: 5528.89 toks/s, output: 2309.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:43<00:11, 65.12it/s, est. speed input: 5610.34 toks/s, output: 2354.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:44<00:10, 70.22it/s, est. speed input: 5704.18 toks/s, output: 2418.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:44<00:07, 95.15it/s, est. speed input: 5901.27 toks/s, output: 2534.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:44<00:08, 78.23it/s, est. speed input: 6018.49 toks/s, output: 2594.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:44<00:10, 63.46it/s, est. speed input: 6086.73 toks/s, output: 2654.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:44<00:06, 97.39it/s, est. speed input: 6428.05 toks/s, output: 2835.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:45<00:07, 88.86it/s, est. speed input: 6553.71 toks/s, output: 2907.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:45<00:07, 80.08it/s, est. speed input: 6631.61 toks/s, output: 2965.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:45<00:08, 73.80it/s, est. speed input: 6708.13 toks/s, output: 3021.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:45<00:07, 85.42it/s, est. speed input: 6848.55 toks/s, output: 3106.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:45<00:06, 95.64it/s, est. speed input: 6984.33 toks/s, output: 3209.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:46<00:07, 71.57it/s, est. speed input: 7131.65 toks/s, output: 3316.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:46<00:06, 87.88it/s, est. speed input: 7353.21 toks/s, output: 3475.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:46<00:06, 86.97it/s, est. speed input: 7478.06 toks/s, output: 3567.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:46<00:04, 105.65it/s, est. speed input: 7660.87 toks/s, output: 3686.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:46<00:04, 114.15it/s, est. speed input: 7789.37 toks/s, output: 3772.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:46<00:03, 137.96it/s, est. speed input: 8012.42 toks/s, output: 3931.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:47<00:03, 124.75it/s, est. speed input: 8182.79 toks/s, output: 4030.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:47<00:02, 155.71it/s, est. speed input: 8459.60 toks/s, output: 4223.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:47<00:03, 112.73it/s, est. speed input: 8600.65 toks/s, output: 4348.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:47<00:03, 103.32it/s, est. speed input: 8718.36 toks/s, output: 4429.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:47<00:02, 140.71it/s, est. speed input: 9034.43 toks/s, output: 4675.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:48<00:02, 114.11it/s, est. speed input: 9171.96 toks/s, output: 4792.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:48<00:02, 117.98it/s, est. speed input: 9294.45 toks/s, output: 4902.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:48<00:02, 122.15it/s, est. speed input: 9433.72 toks/s, output: 5012.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:48<00:02, 113.12it/s, est. speed input: 9549.56 toks/s, output: 5126.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:48<00:02, 126.06it/s, est. speed input: 9762.19 toks/s, output: 5280.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:48<00:01, 157.88it/s, est. speed input: 10029.69 toks/s, output: 5497.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:48<00:01, 151.49it/s, est. speed input: 10197.39 toks/s, output: 5624.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:48<00:01, 150.19it/s, est. speed input: 10355.73 toks/s, output: 5774.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:49<00:01, 141.78it/s, est. speed input: 10519.50 toks/s, output: 5922.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:49<00:01, 105.66it/s, est. speed input: 10622.94 toks/s, output: 5998.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:49<00:01, 105.66it/s, est. speed input: 10737.28 toks/s, output: 6097.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:49<00:00, 129.91it/s, est. speed input: 10947.75 toks/s, output: 6285.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:49<00:00, 101.43it/s, est. speed input: 11028.39 toks/s, output: 6355.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:50<00:00, 91.04it/s, est. speed input: 11117.13 toks/s, output: 6465.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:50<00:00, 78.88it/s, est. speed input: 11199.89 toks/s, output: 6576.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:50<00:00, 75.38it/s, est. speed input: 11267.62 toks/s, output: 6647.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:50<00:00, 66.32it/s, est. speed input: 11310.59 toks/s, output: 6704.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:51<00:00, 47.04it/s, est. speed input: 11316.47 toks/s, output: 6745.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:51<00:00, 30.70it/s, est. speed input: 11260.52 toks/s, output: 6753.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:51<00:00, 31.78it/s, est. speed input: 11278.52 toks/s, output: 6776.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 30.65it/s, est. speed input: 11282.17 toks/s, output: 6801.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:52<00:00, 22.88it/s, est. speed input: 11233.45 toks/s, output: 6831.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 22.25it/s, est. speed input: 11220.50 toks/s, output: 6863.53 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:53<00:00, 24.05it/s, est. speed input: 11220.50 toks/s, output: 6863.53 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.7420174479484558, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.507153739221394e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.6490172743797302, 'actor/pg_clipfrac': 0.005242464132606983, 'actor/ppo_kl': -0.0009756650542840362}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.000381836318410933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000775057589635253}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00016729945491533726, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012610392877832055}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002818841312546283, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003005817998200655}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3973288834095001, 'actor/pg_clipfrac': 0.006237006280571222, 'actor/ppo_kl': -0.001993500627577305}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.19051334261894226, 'actor/pg_clipfrac': 0.004149377811700106, 'actor/ppo_kl': -0.0008031362085603178}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0003348296449985355, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015173928113654256}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3686440587043762, 'actor/pg_clipfrac': 0.0007158195949159563, 'actor/ppo_kl': -0.0015971811953932047}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00023813584994059056, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008598081767559052}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.03352198377251625, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0024718455970287323}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.22716663777828217, 'actor/pg_clipfrac': 0.000861326465383172, 'actor/ppo_kl': 0.00011484177230158821}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.06916755437850952, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012380268890410662}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002084676525555551, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016074972227215767}
[36m(Runner pid=3309020)[0m Step 39
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.248
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.016
[36m(Runner pid=3309020)[0m ppo_kl: 2.108989381701676e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.025
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.025
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.659
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.659
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 975366
[36m(Runner pid=3309020)[0m balanced_min: 973871
[36m(Runner pid=3309020)[0m max: 976834
[36m(Runner pid=3309020)[0m mean: 974618.5
[36m(Runner pid=3309020)[0m min: 972403
[36m(Runner pid=3309020)[0m minmax_diff: 4431
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.86
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(Runner pid=3309020)[0m throughput: 1130.888
[36m(Runner pid=3309020)[0m time_per_step: 861.817
[36m(Runner pid=3309020)[0m total_num_tokens: 1949237
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 463.352
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3260.0
[36m(Runner pid=3309020)[0m mean: 298.069
[36m(Runner pid=3309020)[0m min: 56.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.32
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.659
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.156
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.205
[36m(Runner pid=3309020)[0m gen: 119.225
[36m(Runner pid=3309020)[0m old: 85.654
[36m(Runner pid=3309020)[0m ref: 87.425
[36m(Runner pid=3309020)[0m reward: 6.153
[36m(Runner pid=3309020)[0m step: 861.817
[36m(Runner pid=3309020)[0m update_actor: 562.565
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 40; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:11:27 [executor_base.py:219] It took 0.344996 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.76 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:12:52 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:11:27 [executor_base.py:219] It took 0.345552 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:12:52 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:12:52 [executor_base.py:208] It took 0.325971 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.84 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:12:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:12:59 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:12:59 [executor_base.py:208] It took 0.327237 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.14719153940677643, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.4001938998699188, 'actor/pg_clipfrac': 0.0007930214051157236, 'actor/ppo_kl': -0.0015118876472115517}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.7235230207443237, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -3.017456765519455e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.44653990864753723, 'actor/pg_clipfrac': 0.0021645021624863148, 'actor/ppo_kl': 0.0013113889144733548}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00026722063194029033, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.18487460911273956, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014712678967043757}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.18625760078430176, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2516343295574188, 'actor/pg_clipfrac': 0.0018348623998463154, 'actor/ppo_kl': 0.0003071811224799603}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.12755005061626434, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 8.36643812363036e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.055915724486112595, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00024219723127316684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003245719417463988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5094178915023804, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.006343526300042868, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00016274310473818332, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009675065521150827}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4463331401348114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00027768537984229624, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008545222226530313}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.29804590344429016, 'actor/pg_clipfrac': 0.001240694778971374, 'actor/ppo_kl': -0.0005799830541945994}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3276127278804779, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001445178932044655}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00019920451450161636, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.998977525858209e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4428318440914154, 'actor/pg_clipfrac': 0.0033140017185360193, 'actor/ppo_kl': -0.0010607244912534952}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.20782825350761414, 'actor/pg_clipfrac': 0.004722550045698881, 'actor/ppo_kl': 0.0008975347736850381}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00020819656492676586, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001115054008550942}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4962083697319031, 'actor/pg_clipfrac': 0.001687763724476099, 'actor/ppo_kl': 0.0010911885183304548}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00022552398149855435, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000377134041627869}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.03137898072600365, 'actor/pg_clipfrac': 0.0014534883666783571, 'actor/ppo_kl': 0.0014236222486943007}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00019746656471397728, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00045947651960887015}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00029599404660984874, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002957467222586274}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002719130425248295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002837276260834187}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.07630009204149246, 'actor/pg_clipfrac': 0.0009033423848450184, 'actor/ppo_kl': 0.00038469344144687057}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.24369105696678162, 'actor/pg_clipfrac': 0.005565862637013197, 'actor/ppo_kl': 0.0004346136120148003}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002927533641923219, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007482303772121668}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3430154621601105, 'actor/pg_clipfrac': 0.0023382697254419327, 'actor/ppo_kl': 0.0014714474091306329}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1720074862241745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010824915952980518}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.18442730605602264, 'actor/pg_clipfrac': 0.0010060361819341779, 'actor/ppo_kl': -0.0008530779741704464}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.38483089208602905, 'actor/pg_clipfrac': 0.003913894295692444, 'actor/ppo_kl': -0.0021841623820364475}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.05592154711484909, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0020871509332209826}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.34088653326034546, 'actor/pg_clipfrac': 0.0012210012646391988, 'actor/ppo_kl': 0.000297700084047392}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2615869343280792, 'actor/pg_clipfrac': 0.002463054144755006, 'actor/ppo_kl': 6.481227319454774e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00016139665967784822, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00015560000611003488}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.7490741014480591, 'actor/pg_clipfrac': 0.0009345794678665698, 'actor/ppo_kl': -0.00021209270926192403}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.040654733777046204, 'actor/pg_clipfrac': 0.0019011406693607569, 'actor/ppo_kl': 0.00015310643357224762}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.047365397214889526, 'actor/pg_clipfrac': 0.001172332908026874, 'actor/ppo_kl': 0.0007259932463057339}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2605783939361572, 'actor/pg_clipfrac': 0.0009601536439731717, 'actor/ppo_kl': -0.0007040753844194114}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:35:04, 15.17s/it, est. speed input: 30.78 toks/s, output: 5.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<26:02, 4.18s/it, est. speed input: 88.31 toks/s, output: 16.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:16<12:54, 2.08s/it, est. speed input: 146.92 toks/s, output: 29.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:16<07:36, 1.23s/it, est. speed input: 202.48 toks/s, output: 42.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 9/377 [00:16<04:54, 1.25it/s, est. speed input: 256.25 toks/s, output: 56.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:16<03:18, 1.84it/s, est. speed input: 309.93 toks/s, output: 70.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 13/377 [00:16<02:21, 2.56it/s, est. speed input: 363.86 toks/s, output: 83.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 16/377 [00:16<01:31, 3.93it/s, est. speed input: 441.33 toks/s, output: 104.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 19/377 [00:17<01:07, 5.32it/s, est. speed input: 516.68 toks/s, output: 127.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 22/377 [00:17<00:53, 6.70it/s, est. speed input: 589.71 toks/s, output: 151.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 28/377 [00:17<00:29, 11.67it/s, est. speed input: 743.64 toks/s, output: 200.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 33/377 [00:17<00:22, 15.24it/s, est. speed input: 868.15 toks/s, output: 241.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 36/377 [00:17<00:20, 16.67it/s, est. speed input: 939.77 toks/s, output: 265.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 39/377 [00:17<00:18, 18.03it/s, est. speed input: 1013.12 toks/s, output: 289.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 43/377 [00:17<00:15, 21.01it/s, est. speed input: 1108.01 toks/s, output: 324.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 51/377 [00:18<00:10, 31.15it/s, est. speed input: 1305.28 toks/s, output: 395.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 55/377 [00:18<00:12, 25.19it/s, est. speed input: 1385.50 toks/s, output: 427.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 65/377 [00:18<00:08, 37.62it/s, est. speed input: 1634.84 toks/s, output: 520.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 73/377 [00:18<00:07, 41.90it/s, est. speed input: 1821.99 toks/s, output: 595.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 79/377 [00:18<00:06, 43.91it/s, est. speed input: 1958.80 toks/s, output: 651.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 84/377 [00:18<00:06, 43.43it/s, est. speed input: 2070.90 toks/s, output: 697.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 90/377 [00:18<00:06, 45.48it/s, est. speed input: 2207.69 toks/s, output: 755.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 95/377 [00:19<00:07, 35.32it/s, est. speed input: 2301.59 toks/s, output: 798.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 104/377 [00:19<00:06, 45.19it/s, est. speed input: 2505.08 toks/s, output: 889.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 112/377 [00:19<00:05, 51.47it/s, est. speed input: 2680.97 toks/s, output: 972.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 118/377 [00:19<00:05, 48.97it/s, est. speed input: 2802.85 toks/s, output: 1032.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 125/377 [00:19<00:04, 53.26it/s, est. speed input: 2954.39 toks/s, output: 1106.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:19<00:03, 66.50it/s, est. speed input: 3216.37 toks/s, output: 1234.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 146/377 [00:19<00:03, 69.46it/s, est. speed input: 3413.34 toks/s, output: 1329.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 154/377 [00:20<00:03, 62.17it/s, est. speed input: 3570.58 toks/s, output: 1410.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 161/377 [00:20<00:03, 58.62it/s, est. speed input: 3710.68 toks/s, output: 1482.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 174/377 [00:20<00:02, 73.45it/s, est. speed input: 3995.14 toks/s, output: 1631.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 186/377 [00:20<00:02, 84.01it/s, est. speed input: 4253.83 toks/s, output: 1769.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 195/377 [00:20<00:02, 85.54it/s, est. speed input: 4441.68 toks/s, output: 1873.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 204/377 [00:20<00:02, 78.96it/s, est. speed input: 4624.19 toks/s, output: 1974.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 214/377 [00:20<00:02, 80.82it/s, est. speed input: 4822.25 toks/s, output: 2094.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 224/377 [00:20<00:01, 78.45it/s, est. speed input: 5016.04 toks/s, output: 2211.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 233/377 [00:21<00:02, 71.89it/s, est. speed input: 5185.38 toks/s, output: 2316.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 241/377 [00:21<00:01, 68.76it/s, est. speed input: 5329.77 toks/s, output: 2412.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 249/377 [00:21<00:01, 66.87it/s, est. speed input: 5476.18 toks/s, output: 2509.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 256/377 [00:21<00:01, 61.22it/s, est. speed input: 5595.81 toks/s, output: 2591.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 263/377 [00:21<00:02, 55.74it/s, est. speed input: 5709.72 toks/s, output: 2676.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 271/377 [00:21<00:01, 61.00it/s, est. speed input: 5857.59 toks/s, output: 2782.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▎ | 278/377 [00:21<00:01, 57.84it/s, est. speed input: 5970.01 toks/s, output: 2871.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 286/377 [00:22<00:01, 58.24it/s, est. speed input: 6104.56 toks/s, output: 2977.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 292/377 [00:22<00:01, 54.51it/s, est. speed input: 6198.05 toks/s, output: 3054.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 304/377 [00:22<00:01, 65.59it/s, est. speed input: 6423.98 toks/s, output: 3230.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 311/377 [00:22<00:01, 60.99it/s, est. speed input: 6538.33 toks/s, output: 3326.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 319/377 [00:22<00:00, 64.19it/s, est. speed input: 6678.11 toks/s, output: 3444.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 326/377 [00:22<00:00, 54.58it/s, est. speed input: 6780.18 toks/s, output: 3540.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 332/377 [00:22<00:00, 49.12it/s, est. speed input: 6855.68 toks/s, output: 3624.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 339/377 [00:22<00:00, 53.30it/s, est. speed input: 6967.49 toks/s, output: 3736.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:23<00:00, 34.24it/s, est. speed input: 6987.50 toks/s, output: 3796.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:23<00:00, 30.04it/s, est. speed input: 7020.47 toks/s, output: 3860.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:23<00:00, 27.63it/s, est. speed input: 7046.52 toks/s, output: 3914.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:24<00:01, 18.80it/s, est. speed input: 6998.66 toks/s, output: 3930.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:24<00:00, 16.23it/s, est. speed input: 6974.69 toks/s, output: 3955.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:24<00:00, 14.85it/s, est. speed input: 6960.03 toks/s, output: 3987.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:25<00:01, 9.64it/s, est. speed input: 6846.83 toks/s, output: 3954.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:25<00:00, 9.29it/s, est. speed input: 6818.03 toks/s, output: 3970.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:25<00:00, 8.02it/s, est. speed input: 6756.70 toks/s, output: 3973.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [00:26<00:00, 6.32it/s, est. speed input: 6658.58 toks/s, output: 3955.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:26<00:00, 5.17it/s, est. speed input: 6577.60 toks/s, output: 3929.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [00:49<00:12, 4.08s/it, est. speed input: 3578.93 toks/s, output: 2207.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [01:00<00:10, 5.43s/it, est. speed input: 2941.57 toks/s, output: 1893.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 376/377 [01:03<00:05, 5.01s/it, est. speed input: 2785.50 toks/s, output: 1874.09 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:03<00:00, 5.91it/s, est. speed input: 2791.66 toks/s, output: 1959.50 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3496911823749542, 'actor/pg_clipfrac': 0.0030674845911562443, 'actor/ppo_kl': -0.0004794202686753124}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2834090292453766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00040189249557442963}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.06545525044202805, 'actor/pg_clipfrac': 0.002200704300776124, 'actor/ppo_kl': -0.0005100478301756084}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3087370991706848, 'actor/pg_clipfrac': 0.0027272726874798536, 'actor/ppo_kl': 0.00042680741171352565}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0004025548987556249, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008970115450210869}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00021145978826098144, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013859083410352468}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5129819512367249, 'actor/pg_clipfrac': 0.003201707499101758, 'actor/ppo_kl': 0.0008602630696259439}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5080633759498596, 'actor/pg_clipfrac': 0.0009813542710617185, 'actor/ppo_kl': 0.00225773174315691}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3802652359008789, 'actor/pg_clipfrac': 0.003325020894408226, 'actor/ppo_kl': 0.0006577134481631219}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4049787223339081, 'actor/pg_clipfrac': 0.0008110299822874367, 'actor/ppo_kl': 0.000895870674867183}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.14320628345012665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003691969031933695}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.11761773377656937, 'actor/pg_clipfrac': 0.002998500829562545, 'actor/ppo_kl': -0.0019322907319292426}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.35501179099082947, 'actor/pg_clipfrac': 0.0006234414177015424, 'actor/ppo_kl': -0.0008068429306149483}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.17124630510807037, 'actor/pg_clipfrac': 0.0008741258643567562, 'actor/ppo_kl': 0.0017220006557181478}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3317926526069641, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003885856131091714}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.07708970457315445, 'actor/pg_clipfrac': 0.0008576329564675689, 'actor/ppo_kl': 0.0005620626034215093}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.23842430114746094, 'actor/pg_clipfrac': 0.00041356493602506816, 'actor/ppo_kl': 0.0005356455221772194}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002495397056918591, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008298300090245903}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0003043109318241477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015311797615140676}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08539783209562302, 'actor/pg_clipfrac': 0.002560819499194622, 'actor/ppo_kl': 0.0003828031476587057}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.04 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:26:10 [executor_base.py:219] It took 0.339413 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.95 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:26:10 [executor_base.py:219] It took 0.338885 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:27:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:27:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:27:58 [executor_base.py:208] It took 0.328185 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:27:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:27:59 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:27:59 [executor_base.py:208] It took 0.327990 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric configuration and the properties of the triangle. Since D is the midpoint of AB, and BD = 2.0, it follows that AD = 2.0 as well. This means that D is the midpoint of AB, and thus, the line segment AD is half the length of AB.\n\nThe perimeter of triangle ABC is given as 16.0. Since D is the midpoint of AB, the length of AB is twice the length of AD, which is 4.0. Therefore, the perimeter of triangle ABC can be expressed as AB + BC + AC = 4.0 + BC + AC = 16.0. This simplifies to BC + AC = 12.0.\n\nNow, we need to find the minimum perimeter of triangle AEC. Since E is any point on the line l, which is the perpendicular bisector of AB, the minimum perimeter of triangle AEC occurs when E is the foot of the perpendicular from A to line l. This means that AE is the shortest possible distance from A to line l, which is the same as the distance from A to D (since D is the midpoint of AB and l is perpendicular to AB).\n\nThe perimeter of triangle AEC is then AE + EC + AC. Since AE is the distance from A to D, and D is the midpoint of AB, AE = AD = 2.0. EC is the same as EC because E is on the same horizontal line as C. Therefore, the perimeter of triangle AEC is 2.0 + EC + AC.\n\nSince AC is part of the perimeter of triangle ABC and BC + AC = 12.0, the minimum value of EC is when E is at the point on line l that is directly below C, making EC = 0 (if we consider the straight-line distance from A to C through E). Thus, the minimum perimeter of triangle AEC is 2.0 + 0 + AC = AC = 12.0 (since BC + AC = 12.0).\n\nTherefore, the minimum perimeter of triangle AEC is 12.0.\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo solve this problem, we need to understand the geometric properties and relationships in the given figure. Since triangle ABC is rotated clockwise by 90 degrees around point A to form triangle AB'C', we can deduce that angle BAC = angle B'A'C' = 90 degrees. This implies that triangle AB'C' is also a right triangle with angle B'A'C' = 90 degrees.\n\nGiven that angle CC'B' = 32 degrees, we can infer that angle AC'B' = 90 - 32 = 58 degrees, because the angles in triangle AC'B' must sum up to 180 degrees.\n\nNow, let's connect the points. Since triangle AB'C' is formed by rotating triangle ABC, angle AC'B' is the same as the angle that was originally at angle ACB before the rotation. Therefore, angle AC'B' = angle ACB.\n\nThus, angle AC'B' = 58 degrees.\n\n\nThe size of angle AC'B' is 58 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are on circle O and C is on the diameter BE, we can use properties of circles and angles to find the measure of angle AEB. Since C lies on the diameter BE, angle BAE is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Given that angle ADC = 54 degrees, and since AD is a chord and angle ADC is at point D, angle ADB is also 54 degrees (since opposite angles in a cyclic quadrilateral are supplementary). Therefore, angle ABE = 90 degrees - 54 degrees = 36 degrees. Since angle AEB is part of triangle ABE and we know angle ABE and angle BAE, we can find angle AEB using the fact that the sum of angles in a triangle is 180 degrees. Thus, angle AEB = 180 degrees - 90 degrees - 36 degrees = 54 degrees.\n\n\n54 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to understand the properties of a circle and the relationships between angles formed by tangents and chords.\n\n1. Since AB is the diameter of circle O, angle ADB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle.\n2. CD is tangent to the circle at point D, which means that angle ODC is also a right angle (90 degrees) because a tangent to a circle is perpendicular to the radius at the point of tangency.\n3. Since angle ADB is 90 degrees and angle A is 35 degrees, we can find angle BDA by subtracting angle A from 90 degrees.\n4. angle BDA = 90 degrees - 35 degrees = 55 degrees.\n5. Since angle BDA is the same as angle BDC (as they are both angles formed by the same chord BD with the tangent CD), angle BDC is also 55 degrees.\n6. Now, in triangle BDC, we have angle BDC = 55 degrees and angle DBC = 90 degrees (since BD is the radius and the tangent is perpendicular to the radius at the point of tangency).\n7. To find angle C, we can use the fact that the sum of the angles in a triangle is 180 degrees.\n8. Therefore, angle C = 180 degrees - angle BDC - angle DBC = 180 degrees - 55 degrees - 90 degrees = 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:13:08, 3.44s/it, est. speed input: 133.06 toks/s, output: 23.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<39:46, 1.88s/it, est. speed input: 220.48 toks/s, output: 45.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<29:07, 1.38s/it, est. speed input: 280.08 toks/s, output: 66.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<20:08, 1.04it/s, est. speed input: 346.67 toks/s, output: 86.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<14:37, 1.43it/s, est. speed input: 417.68 toks/s, output: 108.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<10:31, 1.98it/s, est. speed input: 493.18 toks/s, output: 129.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<09:44, 2.13it/s, est. speed input: 537.06 toks/s, output: 142.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<08:01, 2.57it/s, est. speed input: 593.75 toks/s, output: 159.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<06:47, 3.03it/s, est. speed input: 646.00 toks/s, output: 178.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<06:36, 3.10it/s, est. speed input: 683.59 toks/s, output: 194.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<05:39, 3.61it/s, est. speed input: 736.46 toks/s, output: 214.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<03:52, 5.22it/s, est. speed input: 846.79 toks/s, output: 252.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<03:12, 6.29it/s, est. speed input: 903.47 toks/s, output: 277.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:35, 7.74it/s, est. speed input: 963.32 toks/s, output: 295.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<02:02, 9.79it/s, est. speed input: 1022.65 toks/s, output: 315.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:36<01:42, 11.61it/s, est. speed input: 1079.70 toks/s, output: 339.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:33, 12.69it/s, est. speed input: 1133.47 toks/s, output: 363.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:21, 14.49it/s, est. speed input: 1186.11 toks/s, output: 386.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:13, 16.13it/s, est. speed input: 1240.54 toks/s, output: 405.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:51, 22.93it/s, est. speed input: 1354.87 toks/s, output: 452.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<00:37, 31.19it/s, est. speed input: 1474.70 toks/s, output: 504.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:38, 29.96it/s, est. speed input: 1580.54 toks/s, output: 545.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:41, 27.56it/s, est. speed input: 1685.96 toks/s, output: 586.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:28, 40.13it/s, est. speed input: 1857.81 toks/s, output: 659.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:28, 39.13it/s, est. speed input: 1966.06 toks/s, output: 704.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:33, 32.97it/s, est. speed input: 2012.75 toks/s, output: 725.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:32, 33.73it/s, est. speed input: 2063.78 toks/s, output: 744.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:35, 31.34it/s, est. speed input: 2111.20 toks/s, output: 765.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:32, 33.55it/s, est. speed input: 2214.51 toks/s, output: 812.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:34, 31.81it/s, est. speed input: 2258.26 toks/s, output: 838.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:28, 38.02it/s, est. speed input: 2366.27 toks/s, output: 886.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:22, 47.19it/s, est. speed input: 2477.71 toks/s, output: 942.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:40<00:23, 45.36it/s, est. speed input: 2576.96 toks/s, output: 986.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:40<00:19, 53.35it/s, est. speed input: 2685.07 toks/s, output: 1034.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:24, 42.02it/s, est. speed input: 2775.18 toks/s, output: 1080.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:41<00:23, 42.59it/s, est. speed input: 2915.91 toks/s, output: 1145.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:42<00:38, 26.32it/s, est. speed input: 2933.76 toks/s, output: 1157.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:42<00:35, 28.47it/s, est. speed input: 2985.39 toks/s, output: 1181.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:32, 30.84it/s, est. speed input: 3030.13 toks/s, output: 1207.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:42<00:29, 33.36it/s, est. speed input: 3076.13 toks/s, output: 1235.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:30, 32.83it/s, est. speed input: 3118.52 toks/s, output: 1261.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:42<00:26, 37.65it/s, est. speed input: 3209.34 toks/s, output: 1301.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:16, 57.72it/s, est. speed input: 3358.45 toks/s, output: 1383.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:43<00:17, 54.56it/s, est. speed input: 3467.80 toks/s, output: 1443.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:43<00:19, 48.97it/s, est. speed input: 3550.97 toks/s, output: 1489.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:19, 49.37it/s, est. speed input: 3641.84 toks/s, output: 1523.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:18, 50.19it/s, est. speed input: 3828.63 toks/s, output: 1599.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:44<00:16, 53.89it/s, est. speed input: 3923.91 toks/s, output: 1643.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:44<00:17, 50.57it/s, est. speed input: 4010.06 toks/s, output: 1689.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:44<00:19, 46.46it/s, est. speed input: 4089.43 toks/s, output: 1728.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:44<00:16, 52.37it/s, est. speed input: 4177.59 toks/s, output: 1770.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:44<00:17, 50.57it/s, est. speed input: 4261.98 toks/s, output: 1812.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:45<00:15, 56.49it/s, est. speed input: 4354.88 toks/s, output: 1849.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:45<00:15, 53.65it/s, est. speed input: 4431.50 toks/s, output: 1898.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:45<00:16, 51.89it/s, est. speed input: 4513.69 toks/s, output: 1948.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:45<00:15, 54.56it/s, est. speed input: 4598.99 toks/s, output: 2003.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:15, 54.23it/s, est. speed input: 4721.31 toks/s, output: 2088.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:46<00:15, 51.79it/s, est. speed input: 4838.80 toks/s, output: 2156.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:46<00:11, 65.65it/s, est. speed input: 4985.04 toks/s, output: 2236.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:46<00:10, 75.20it/s, est. speed input: 5122.68 toks/s, output: 2320.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:46<00:09, 82.57it/s, est. speed input: 5297.65 toks/s, output: 2421.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:46<00:10, 72.86it/s, est. speed input: 5371.43 toks/s, output: 2472.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:47<00:08, 81.93it/s, est. speed input: 5548.99 toks/s, output: 2583.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:47<00:06, 102.08it/s, est. speed input: 5779.33 toks/s, output: 2713.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:47<00:06, 106.47it/s, est. speed input: 5912.44 toks/s, output: 2799.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:47<00:07, 85.02it/s, est. speed input: 6018.37 toks/s, output: 2885.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:47<00:08, 77.71it/s, est. speed input: 6139.43 toks/s, output: 2941.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:47<00:08, 72.33it/s, est. speed input: 6210.15 toks/s, output: 2993.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:48<00:07, 83.83it/s, est. speed input: 6345.33 toks/s, output: 3079.65 toks/s]
Processed prompts: 53%|█████▎ | 675/1280 [00:48<00:05, 106.67it/s, est. speed input: 6515.76 toks/s, output: 3194.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:48<00:06, 87.97it/s, est. speed input: 6624.40 toks/s, output: 3254.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:48<00:06, 89.54it/s, est. speed input: 6782.07 toks/s, output: 3377.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:48<00:06, 90.38it/s, est. speed input: 6910.02 toks/s, output: 3458.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:49<00:06, 85.96it/s, est. speed input: 7024.92 toks/s, output: 3542.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:49<00:05, 96.21it/s, est. speed input: 7153.35 toks/s, output: 3623.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:49<00:05, 98.17it/s, est. speed input: 7283.45 toks/s, output: 3704.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:49<00:04, 104.12it/s, est. speed input: 7407.00 toks/s, output: 3794.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:49<00:05, 85.86it/s, est. speed input: 7517.55 toks/s, output: 3881.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:49<00:05, 79.28it/s, est. speed input: 7630.62 toks/s, output: 3958.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:50<00:05, 83.21it/s, est. speed input: 7749.74 toks/s, output: 4042.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:50<00:04, 92.46it/s, est. speed input: 7872.69 toks/s, output: 4141.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:50<00:04, 97.30it/s, est. speed input: 7991.71 toks/s, output: 4221.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:50<00:04, 94.77it/s, est. speed input: 8098.40 toks/s, output: 4310.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:50<00:03, 122.50it/s, est. speed input: 8352.07 toks/s, output: 4502.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:50<00:03, 111.98it/s, est. speed input: 8464.88 toks/s, output: 4589.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:50<00:02, 116.21it/s, est. speed input: 8584.11 toks/s, output: 4693.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:51<00:01, 175.07it/s, est. speed input: 8973.99 toks/s, output: 5020.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:51<00:01, 149.82it/s, est. speed input: 9123.54 toks/s, output: 5158.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:51<00:02, 118.01it/s, est. speed input: 9255.35 toks/s, output: 5284.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:51<00:01, 141.51it/s, est. speed input: 9469.87 toks/s, output: 5486.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:51<00:01, 159.92it/s, est. speed input: 9671.54 toks/s, output: 5695.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:51<00:01, 156.94it/s, est. speed input: 9832.74 toks/s, output: 5850.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:52<00:01, 116.46it/s, est. speed input: 9948.91 toks/s, output: 5987.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:52<00:01, 107.31it/s, est. speed input: 10061.04 toks/s, output: 6093.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:52<00:01, 96.49it/s, est. speed input: 10155.57 toks/s, output: 6196.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:52<00:01, 107.79it/s, est. speed input: 10303.85 toks/s, output: 6346.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:52<00:01, 99.69it/s, est. speed input: 10397.57 toks/s, output: 6476.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:53<00:00, 93.54it/s, est. speed input: 10492.69 toks/s, output: 6580.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 68.34it/s, est. speed input: 10576.05 toks/s, output: 6688.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:00, 66.70it/s, est. speed input: 10629.30 toks/s, output: 6758.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 54.26it/s, est. speed input: 10656.57 toks/s, output: 6804.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:54<00:00, 58.69it/s, est. speed input: 10723.93 toks/s, output: 6867.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:54<00:00, 47.48it/s, est. speed input: 10741.21 toks/s, output: 6945.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 30.69it/s, est. speed input: 10699.38 toks/s, output: 6951.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:00, 32.67it/s, est. speed input: 10721.62 toks/s, output: 6995.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:56<00:00, 14.99it/s, est. speed input: 10544.81 toks/s, output: 6908.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 14.46it/s, est. speed input: 10514.84 toks/s, output: 6903.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.56it/s, est. speed input: 10544.16 toks/s, output: 6943.50 toks/s]
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ABD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Since EA is tangent to the circle at point A, angle EAB is also 90 degrees.\n\nGiven angle EAC = 120 degrees, we can find angle BAC by subtracting angle EAB from angle EAC:\nangle BAC = angle EAC - angle EAB = 120 degrees - 90 degrees = 30 degrees.\n\nSince angle ABC is an inscribed angle that intercepts the same arc as angle BAC (which is subtended by the same arc AC), it follows that angle ABC is half of angle BAC because the angle at the circumference is half the angle at the center that subtends the same arc:\nangle ABC = 1/2 * angle BAC = 1/2 * 30 degrees = 15 degrees.\n\nTherefore, the degree of angle ABC is 15 degrees.\n\n\n15 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_25
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 40
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.249
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.017
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.008
[36m(Runner pid=3309020)[0m ppo_kl: 8.586060588982037e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.016
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.665
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.665
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 970730
[36m(Runner pid=3309020)[0m balanced_min: 970648
[36m(Runner pid=3309020)[0m max: 978849
[36m(Runner pid=3309020)[0m mean: 970689.0
[36m(Runner pid=3309020)[0m min: 962529
[36m(Runner pid=3309020)[0m minmax_diff: 16320
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.971
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 933.909
[36m(Runner pid=3309020)[0m time_per_step: 1039.382
[36m(Runner pid=3309020)[0m total_num_tokens: 1941378
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 465.82
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1827.0
[36m(Runner pid=3309020)[0m mean: 292.53
[36m(Runner pid=3309020)[0m min: 51.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.332
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.665
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.354151328518666e-05
[36m(Runner pid=3309020)[0m gen: 0.14
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.29
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.182
[36m(Runner pid=3309020)[0m gen: 104.784
[36m(Runner pid=3309020)[0m old: 85.902
[36m(Runner pid=3309020)[0m ref: 87.039
[36m(Runner pid=3309020)[0m reward: 6.428
[36m(Runner pid=3309020)[0m save_checkpoint: 29.116
[36m(Runner pid=3309020)[0m step: 1039.382
[36m(Runner pid=3309020)[0m update_actor: 562.525
[36m(Runner pid=3309020)[0m validation: 162.827
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.358
[36m(Runner pid=3309020)[0m format_reward: 0.989
[36m(Runner pid=3309020)[0m overall_reward: 0.674
[36m(Runner pid=3309020)[0m reward_score: 0.674
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.992
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 41; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:28:48 [executor_base.py:219] It took 0.343563 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:30:12 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:28:48 [executor_base.py:219] It took 0.347080 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:30:12 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:30:12 [executor_base.py:208] It took 0.327940 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:30:16 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:30:16 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:30:16 [executor_base.py:208] It took 0.325418 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0001910191058414057, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005399824003688991}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00029135154909454286, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.09678742289543152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.09556911885738373, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.31873130798339844, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00019757366681005806, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.011355550028383732, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002608250651974231, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00015210382116492838, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.1712716519832611, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007731560035608709}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3430343568325043, 'actor/pg_clipfrac': 0.0018903592135757208, 'actor/ppo_kl': 0.0010067339753732085}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4474773108959198, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006153791327960789}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.4485871493816376, 'actor/pg_clipfrac': 0.0004906771355308592, 'actor/ppo_kl': -0.0017863985849544406}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.13659295439720154, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.17368951439857483, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00078582763671875}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.10533444583415985, 'actor/pg_clipfrac': 0.0028248587623238564, 'actor/ppo_kl': -0.0013692015781998634}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.2924327254295349, 'actor/pg_clipfrac': 0.0019455252913758159, 'actor/ppo_kl': 0.002558042062446475}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.13366387784481049, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004347010690253228}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.38697201013565063, 'actor/pg_clipfrac': 0.003132341429591179, 'actor/ppo_kl': -0.00019435389549471438}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.39092355966567993, 'actor/pg_clipfrac': 0.001168907037936151, 'actor/ppo_kl': 0.0009524280321784317}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.4377710521221161, 'actor/pg_clipfrac': 0.0009107468067668378, 'actor/ppo_kl': 0.00022739834093954414}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.33658379316329956, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0025841612368822098}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00047343297046609223, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009131722617894411}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.26832783222198486, 'actor/pg_clipfrac': 0.0021482277661561966, 'actor/ppo_kl': -0.0004627707239706069}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.25284770131111145, 'actor/pg_clipfrac': 0.001752848387695849, 'actor/ppo_kl': -0.0019343373132869601}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.04264381155371666, 'actor/pg_clipfrac': 0.0006468305364251137, 'actor/ppo_kl': -0.00011386229743948206}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.30040138959884644, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007162558031268418}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.1635291874408722, 'actor/pg_clipfrac': 0.0010928962146863341, 'actor/ppo_kl': -0.0008252211846411228}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.08263933658599854, 'actor/pg_clipfrac': 0.0018665422685444355, 'actor/ppo_kl': 0.0005372866871766746}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.20731136202812195, 'actor/pg_clipfrac': 0.0016198704252019525, 'actor/ppo_kl': -0.0010702027939260006}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.26692286133766174, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005907066515646875}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3490680158138275, 'actor/pg_clipfrac': 0.0029069767333567142, 'actor/ppo_kl': 0.0008716777083463967}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4387173056602478, 'actor/pg_clipfrac': 0.0014652014942839742, 'actor/ppo_kl': -0.000804377137683332}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4130171239376068, 'actor/pg_clipfrac': 0.00327332247979939, 'actor/ppo_kl': -0.0004608135495800525}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00025133462622761726, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041596920345909894}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.14171797037124634, 'actor/pg_clipfrac': 0.0017969452310353518, 'actor/ppo_kl': 0.001802283339202404}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.35675832629203796, 'actor/pg_clipfrac': 0.004098360426723957, 'actor/ppo_kl': -0.0009493801626376808}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.061909642070531845, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000923973333556205}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00024769414449110627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.8782060578814708e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00045204407069832087, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013615989591926336}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3712841272354126, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004550618177745491}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002677260199561715, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0023817066103219986}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00014920365356374532, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008292594575323164}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.16207943856716156, 'actor/pg_clipfrac': 0.0007849293760955334, 'actor/ppo_kl': -0.0009209559648297727}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:16:11, 3.59s/it, est. speed input: 129.96 toks/s, output: 27.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<52:49, 2.50s/it, est. speed input: 169.79 toks/s, output: 45.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<20:37, 1.02it/s, est. speed input: 335.55 toks/s, output: 92.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:27<14:57, 1.40it/s, est. speed input: 415.75 toks/s, output: 115.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<12:49, 1.63it/s, est. speed input: 466.99 toks/s, output: 133.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<10:40, 1.94it/s, est. speed input: 516.52 toks/s, output: 150.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<07:09, 2.88it/s, est. speed input: 632.30 toks/s, output: 184.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<05:36, 3.66it/s, est. speed input: 701.39 toks/s, output: 207.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:33<02:53, 7.00it/s, est. speed input: 901.76 toks/s, output: 271.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:56, 6.87it/s, est. speed input: 946.01 toks/s, output: 287.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:34<02:29, 8.09it/s, est. speed input: 999.83 toks/s, output: 311.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<02:49, 7.09it/s, est. speed input: 1030.19 toks/s, output: 324.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:46, 11.12it/s, est. speed input: 1154.81 toks/s, output: 370.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:19, 14.75it/s, est. speed input: 1273.88 toks/s, output: 418.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<01:17, 15.23it/s, est. speed input: 1329.19 toks/s, output: 438.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<01:14, 15.71it/s, est. speed input: 1384.99 toks/s, output: 461.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<00:54, 21.30it/s, est. speed input: 1511.33 toks/s, output: 507.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:36<00:43, 26.61it/s, est. speed input: 1628.25 toks/s, output: 558.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:40, 28.08it/s, est. speed input: 1685.65 toks/s, output: 580.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:37<00:38, 29.58it/s, est. speed input: 1738.79 toks/s, output: 603.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:37<00:35, 32.06it/s, est. speed input: 1848.01 toks/s, output: 645.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:37<00:35, 31.25it/s, est. speed input: 1949.89 toks/s, output: 683.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:46, 23.97it/s, est. speed input: 1989.77 toks/s, output: 707.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:34, 32.13it/s, est. speed input: 2102.38 toks/s, output: 756.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:22, 47.69it/s, est. speed input: 2271.11 toks/s, output: 833.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:38<00:21, 49.80it/s, est. speed input: 2376.50 toks/s, output: 885.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:38<00:18, 56.60it/s, est. speed input: 2488.10 toks/s, output: 928.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:26, 40.53it/s, est. speed input: 2578.74 toks/s, output: 965.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:23, 44.52it/s, est. speed input: 2686.63 toks/s, output: 1019.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:23, 44.32it/s, est. speed input: 2790.55 toks/s, output: 1066.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:39<00:26, 38.59it/s, est. speed input: 2879.72 toks/s, output: 1107.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:27, 36.67it/s, est. speed input: 2924.78 toks/s, output: 1130.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:22, 45.83it/s, est. speed input: 3034.23 toks/s, output: 1183.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:40<00:18, 54.07it/s, est. speed input: 3190.30 toks/s, output: 1265.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:40<00:16, 61.83it/s, est. speed input: 3291.66 toks/s, output: 1323.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:40<00:19, 49.14it/s, est. speed input: 3381.78 toks/s, output: 1372.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:41<00:19, 49.02it/s, est. speed input: 3479.10 toks/s, output: 1429.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:41<00:18, 52.89it/s, est. speed input: 3574.98 toks/s, output: 1470.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:41<00:18, 51.95it/s, est. speed input: 3673.81 toks/s, output: 1520.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:41<00:13, 69.61it/s, est. speed input: 3842.45 toks/s, output: 1603.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:41<00:13, 66.09it/s, est. speed input: 3985.42 toks/s, output: 1683.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:17, 50.44it/s, est. speed input: 4113.16 toks/s, output: 1743.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:42<00:16, 54.03it/s, est. speed input: 4212.06 toks/s, output: 1792.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:42<00:12, 72.57it/s, est. speed input: 4416.09 toks/s, output: 1905.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:42<00:09, 89.97it/s, est. speed input: 4622.15 toks/s, output: 2025.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:42<00:10, 77.80it/s, est. speed input: 4760.39 toks/s, output: 2091.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:42<00:09, 86.31it/s, est. speed input: 4910.37 toks/s, output: 2167.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:43<00:08, 94.04it/s, est. speed input: 5063.30 toks/s, output: 2245.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:43<00:12, 63.94it/s, est. speed input: 5175.40 toks/s, output: 2304.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:43<00:17, 45.66it/s, est. speed input: 5225.92 toks/s, output: 2331.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:44<00:15, 48.81it/s, est. speed input: 5319.39 toks/s, output: 2386.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:44<00:14, 54.57it/s, est. speed input: 5405.09 toks/s, output: 2448.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:44<00:12, 60.41it/s, est. speed input: 5497.68 toks/s, output: 2488.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:44<00:11, 66.93it/s, est. speed input: 5593.25 toks/s, output: 2543.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:44<00:08, 87.42it/s, est. speed input: 5788.40 toks/s, output: 2654.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:44<00:07, 100.18it/s, est. speed input: 5932.48 toks/s, output: 2740.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:44<00:06, 111.51it/s, est. speed input: 6072.71 toks/s, output: 2830.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:44<00:06, 112.02it/s, est. speed input: 6209.09 toks/s, output: 2899.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:45<00:06, 105.33it/s, est. speed input: 6344.25 toks/s, output: 2966.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:45<00:06, 96.05it/s, est. speed input: 6468.20 toks/s, output: 3023.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:45<00:06, 102.57it/s, est. speed input: 6609.93 toks/s, output: 3091.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:45<00:08, 74.12it/s, est. speed input: 6708.94 toks/s, output: 3153.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:45<00:07, 78.01it/s, est. speed input: 6875.36 toks/s, output: 3255.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:46<00:07, 79.65it/s, est. speed input: 6954.30 toks/s, output: 3316.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:46<00:07, 73.55it/s, est. speed input: 7026.54 toks/s, output: 3364.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:46<00:10, 56.62it/s, est. speed input: 7077.84 toks/s, output: 3390.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:46<00:07, 75.03it/s, est. speed input: 7249.38 toks/s, output: 3487.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:46<00:07, 68.51it/s, est. speed input: 7319.26 toks/s, output: 3532.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:46<00:05, 90.12it/s, est. speed input: 7496.63 toks/s, output: 3650.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:47<00:05, 97.52it/s, est. speed input: 7665.15 toks/s, output: 3782.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:47<00:04, 103.34it/s, est. speed input: 7793.32 toks/s, output: 3846.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:47<00:04, 108.51it/s, est. speed input: 7914.90 toks/s, output: 3916.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:47<00:04, 108.71it/s, est. speed input: 8045.07 toks/s, output: 4006.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:47<00:03, 130.47it/s, est. speed input: 8263.57 toks/s, output: 4177.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:47<00:03, 124.55it/s, est. speed input: 8430.12 toks/s, output: 4295.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:47<00:03, 124.80it/s, est. speed input: 8560.46 toks/s, output: 4400.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:48<00:03, 107.58it/s, est. speed input: 8672.43 toks/s, output: 4468.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:48<00:03, 113.67it/s, est. speed input: 8801.30 toks/s, output: 4569.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:48<00:02, 121.58it/s, est. speed input: 9010.92 toks/s, output: 4756.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:48<00:02, 135.70it/s, est. speed input: 9189.69 toks/s, output: 4927.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:48<00:01, 156.13it/s, est. speed input: 9401.39 toks/s, output: 5073.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:48<00:02, 132.33it/s, est. speed input: 9541.79 toks/s, output: 5176.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:49<00:02, 116.87it/s, est. speed input: 9655.00 toks/s, output: 5260.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:49<00:02, 87.94it/s, est. speed input: 9738.45 toks/s, output: 5361.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:49<00:02, 100.06it/s, est. speed input: 9896.72 toks/s, output: 5522.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:49<00:02, 99.33it/s, est. speed input: 10045.40 toks/s, output: 5686.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:49<00:01, 101.49it/s, est. speed input: 10156.85 toks/s, output: 5804.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:49<00:01, 104.73it/s, est. speed input: 10266.37 toks/s, output: 5910.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:50<00:01, 92.18it/s, est. speed input: 10353.77 toks/s, output: 6019.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:50<00:01, 88.53it/s, est. speed input: 10451.72 toks/s, output: 6123.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:50<00:01, 89.62it/s, est. speed input: 10561.60 toks/s, output: 6241.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:50<00:01, 92.59it/s, est. speed input: 10672.91 toks/s, output: 6358.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:50<00:01, 71.60it/s, est. speed input: 10714.93 toks/s, output: 6399.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:51<00:01, 80.06it/s, est. speed input: 10821.06 toks/s, output: 6502.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:51<00:00, 80.96it/s, est. speed input: 10884.60 toks/s, output: 6578.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:51<00:01, 53.95it/s, est. speed input: 10946.72 toks/s, output: 6701.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:51<00:00, 54.90it/s, est. speed input: 11008.32 toks/s, output: 6779.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:52<00:00, 42.02it/s, est. speed input: 11009.37 toks/s, output: 6827.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:52<00:00, 45.55it/s, est. speed input: 11060.50 toks/s, output: 6912.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:52<00:00, 36.27it/s, est. speed input: 11061.26 toks/s, output: 6968.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:53<00:00, 24.34it/s, est. speed input: 10989.84 toks/s, output: 6933.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 10.05it/s, est. speed input: 10675.86 toks/s, output: 6772.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 11.82it/s, est. speed input: 10686.64 toks/s, output: 6814.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:03<00:00, 2.33it/s, est. speed input: 9393.63 toks/s, output: 6034.57 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:03<00:00, 20.23it/s, est. speed input: 9393.63 toks/s, output: 6034.57 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11782528460025787, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006631370051763952}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.24057409167289734, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00035345900687389076}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.23914918303489685, 'actor/pg_clipfrac': 0.0010787486098706722, 'actor/ppo_kl': -0.0003281174576841295}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5448037385940552, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018604546785354614}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.08360586315393448, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007625233847647905}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.1519758552312851, 'actor/pg_clipfrac': 0.0010460250778123736, 'actor/ppo_kl': 0.0014503091806545854}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.19265230000019073, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016323148738592863}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.05733976513147354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009039600263349712}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00014476742944680154, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.2645806388463825e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.31202012300491333, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00034763544681482017}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2824387550354004, 'actor/pg_clipfrac': 0.001687763724476099, 'actor/ppo_kl': 0.0002640527090989053}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5758761167526245, 'actor/pg_clipfrac': 0.0010638297535479069, 'actor/ppo_kl': 0.0005603404715657234}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.8933912515640259, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000978362513706088}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2340630143880844, 'actor/pg_clipfrac': 0.0023781212512403727, 'actor/ppo_kl': 0.00031487696105614305}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.29174262285232544, 'actor/pg_clipfrac': 0.0014992504147812724, 'actor/ppo_kl': -0.0011473230551928282}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002817666972987354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001339442329481244}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.21263939142227173, 'actor/pg_clipfrac': 0.0010214505018666387, 'actor/ppo_kl': 0.001017096103169024}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.5126794576644897, 'actor/pg_clipfrac': 0.002650176640599966, 'actor/ppo_kl': 0.0010457595344632864}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.051304399967193604, 'actor/pg_clipfrac': 0.0007886435487307608, 'actor/ppo_kl': 0.0009150715777650476}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3146318197250366, 'actor/pg_clipfrac': 0.000684931525029242, 'actor/ppo_kl': -0.0001743473403621465}
[36m(Runner pid=3309020)[0m Step 41
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.252
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.017
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.015
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.655
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.655
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 985331
[36m(Runner pid=3309020)[0m balanced_min: 985330
[36m(Runner pid=3309020)[0m max: 992299
[36m(Runner pid=3309020)[0m mean: 985330.5
[36m(Runner pid=3309020)[0m min: 978362
[36m(Runner pid=3309020)[0m minmax_diff: 13937
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.816
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1166.881
[36m(Runner pid=3309020)[0m time_per_step: 844.414
[36m(Runner pid=3309020)[0m total_num_tokens: 1970661
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 709.0
[36m(Runner pid=3309020)[0m mean: 467.01
[36m(Runner pid=3309020)[0m min: 414.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1249.0
[36m(Runner pid=3309020)[0m mean: 302.78
[36m(Runner pid=3309020)[0m min: 52.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.31
[36m(Runner pid=3309020)[0m format: 0.999
[36m(Runner pid=3309020)[0m overall: 0.655
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.81553565211467e-05
[36m(Runner pid=3309020)[0m gen: 0.131
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.285
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.154
[36m(Runner pid=3309020)[0m gen: 101.791
[36m(Runner pid=3309020)[0m old: 85.636
[36m(Runner pid=3309020)[0m ref: 87.647
[36m(Runner pid=3309020)[0m reward: 6.163
[36m(Runner pid=3309020)[0m step: 844.414
[36m(Runner pid=3309020)[0m update_actor: 562.431
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 42; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.61 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:42:57 [executor_base.py:219] It took 0.339259 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.52 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.70 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:42:57 [executor_base.py:219] It took 0.340764 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:44:31 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:44:32 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:44:32 [executor_base.py:208] It took 0.325955 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:44:43 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:44:43 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:44:43 [executor_base.py:208] It took 0.327713 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.21645595133304596, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002708507527131587}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.08415188640356064, 'actor/pg_clipfrac': 0.0007342143799178302, 'actor/ppo_kl': -0.0003976079751737416}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.08512425422668457, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006532336701638997}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.6152417063713074, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0003226198605261743, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00045082991709932685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.786957914708182e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.6653624176979065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00025204900885000825, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014893811894580722}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1798119693994522, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.24280178546905518, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.4474423825740814, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00032971962355077267, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00032450712751597166, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.10646183788776398, 'actor/pg_clipfrac': 0.0006116207805462182, 'actor/ppo_kl': -0.00024596397997811437}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3819420635700226, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.38295483589172363, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0004177494265604764, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013327443739399314}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.173945814371109, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00034372825757600367}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.23023326694965363, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001978387124836445}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.27659323811531067, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007316648843698204}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5140275955200195, 'actor/pg_clipfrac': 0.0018331805476918817, 'actor/ppo_kl': -0.0008708995883353055}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0008490202017128468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010251260828226805}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5290156006813049, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001252584159374237}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.42902272939682007, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016352349484805018}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2695629298686981, 'actor/pg_clipfrac': 0.0008368201088160276, 'actor/ppo_kl': -0.0007056152680888772}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0001307346683461219, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008132001385092735}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3644983172416687, 'actor/pg_clipfrac': 0.0007824726053513587, 'actor/ppo_kl': -0.00032000921783037484}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003084036579821259, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003383368893992156}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.3841163218021393, 'actor/pg_clipfrac': 0.0004810004902537912, 'actor/ppo_kl': -1.4874383850838058e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.42882803082466125, 'actor/pg_clipfrac': 0.002430133754387498, 'actor/ppo_kl': 9.700131158751901e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.313261479139328, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025539944181218743}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00023775811132509261, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002132222434738651}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.22020921111106873, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000888883660081774}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00026963523123413324, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009486620547249913}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.17339524626731873, 'actor/pg_clipfrac': 0.0023885350674390793, 'actor/ppo_kl': 0.0006635431782342494}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1868717074394226, 'actor/pg_clipfrac': 0.000604229629971087, 'actor/ppo_kl': 0.00016272046195808798}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.17977593839168549, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000642619444988668}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.16412749886512756, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001455053803510964}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1954202800989151, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012877058470621705}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.6220188140869141, 'actor/pg_clipfrac': 0.0011441647075116634, 'actor/ppo_kl': -0.001251033041626215}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5146642327308655, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005102718714624643}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.05390008166432381, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00011697094305418432}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00020000427321065217, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002077204262604937}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.3391963243484497, 'actor/pg_clipfrac': 0.0009910803055390716, 'actor/ppo_kl': -0.0008292307029478252}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.34886473417282104, 'actor/pg_clipfrac': 0.0007654037326574326, 'actor/ppo_kl': -0.0011127536417916417}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00019376359705347568, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011925633298233151}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.032453715801239014, 'actor/pg_clipfrac': 0.0028763182926923037, 'actor/ppo_kl': 0.001216277596540749}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2622244954109192, 'actor/pg_clipfrac': 0.0018159806495532393, 'actor/ppo_kl': 0.0002875305071938783}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00014983680739533156, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006860084249638021}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12367004156112671, 'actor/pg_clipfrac': 0.000641436839941889, 'actor/ppo_kl': 0.0008715286385267973}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.1903676688671112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005056443624198437}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.10597044229507446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002584716072306037}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.027803758159279823, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013542660744860768}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.22492974996566772, 'actor/pg_clipfrac': 0.0019607844296842813, 'actor/ppo_kl': 0.0003362019779160619}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0001862866192823276, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.2686135050898883e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -1.0276050567626953, 'actor/pg_clipfrac': 0.001949317753314972, 'actor/ppo_kl': -0.0015732391038909554}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0003149959084112197, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009750339668244123}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5264042019844055, 'actor/pg_clipfrac': 0.0015540015883743763, 'actor/ppo_kl': -3.833963546640007e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.05887198448181152, 'actor/pg_clipfrac': 0.0008880994864739478, 'actor/ppo_kl': 0.0003905711346305907}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.04273158684372902, 'actor/pg_clipfrac': 0.0005813953466713428, 'actor/ppo_kl': -0.000994739937596023}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0004151368048042059, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010697728721424937}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.44690006971359253, 'actor/pg_clipfrac': 0.001398601452820003, 'actor/ppo_kl': 0.0015976165886968374}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.07071990519762039, 'actor/pg_clipfrac': 0.002369668334722519, 'actor/ppo_kl': 0.0010304247261956334}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.03370166942477226, 'actor/pg_clipfrac': 0.0019417476141825318, 'actor/ppo_kl': 0.001225791871547699}
[36m(Runner pid=3309020)[0m Step 42
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.259
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.026
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.013
[36m(Runner pid=3309020)[0m ppo_kl: 2.9280773831885652e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.65
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.65
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 990158
[36m(Runner pid=3309020)[0m balanced_min: 989787
[36m(Runner pid=3309020)[0m max: 1003589
[36m(Runner pid=3309020)[0m mean: 989972.5
[36m(Runner pid=3309020)[0m min: 976356
[36m(Runner pid=3309020)[0m minmax_diff: 27233
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.076
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1135.582
[36m(Runner pid=3309020)[0m time_per_step: 871.776
[36m(Runner pid=3309020)[0m total_num_tokens: 1979945
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 668.0
[36m(Runner pid=3309020)[0m mean: 464.578
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3416.0
[36m(Runner pid=3309020)[0m mean: 308.838
[36m(Runner pid=3309020)[0m min: 58.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.302
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.65
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.287
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.379
[36m(Runner pid=3309020)[0m gen: 119.834
[36m(Runner pid=3309020)[0m old: 87.048
[36m(Runner pid=3309020)[0m ref: 88.023
[36m(Runner pid=3309020)[0m reward: 6.924
[36m(Runner pid=3309020)[0m step: 871.776
[36m(Runner pid=3309020)[0m update_actor: 568.955
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 43; batch size: 512
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:24:44, 3.99s/it, est. speed input: 113.09 toks/s, output: 22.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<45:14, 2.14s/it, est. speed input: 185.32 toks/s, output: 39.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<27:33, 1.31s/it, est. speed input: 264.34 toks/s, output: 64.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:26<18:14, 1.15it/s, est. speed input: 341.29 toks/s, output: 85.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:17, 1.46it/s, est. speed input: 401.06 toks/s, output: 106.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:28<10:00, 2.08it/s, est. speed input: 476.15 toks/s, output: 127.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:29<07:17, 2.85it/s, est. speed input: 545.29 toks/s, output: 147.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:30<07:03, 2.93it/s, est. speed input: 587.02 toks/s, output: 162.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<07:31, 2.73it/s, est. speed input: 621.37 toks/s, output: 177.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<05:45, 3.56it/s, est. speed input: 679.41 toks/s, output: 193.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<03:11, 6.38it/s, est. speed input: 815.70 toks/s, output: 242.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<03:35, 5.64it/s, est. speed input: 861.24 toks/s, output: 255.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:47, 7.22it/s, est. speed input: 921.42 toks/s, output: 277.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<02:11, 9.18it/s, est. speed input: 982.98 toks/s, output: 301.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:35<01:20, 14.75it/s, est. speed input: 1107.03 toks/s, output: 348.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:35, 12.45it/s, est. speed input: 1156.40 toks/s, output: 368.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<01:03, 18.54it/s, est. speed input: 1275.84 toks/s, output: 413.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<00:57, 20.60it/s, est. speed input: 1334.21 toks/s, output: 434.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<00:54, 21.35it/s, est. speed input: 1440.87 toks/s, output: 477.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<00:49, 23.28it/s, est. speed input: 1494.55 toks/s, output: 489.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<00:45, 25.24it/s, est. speed input: 1549.85 toks/s, output: 510.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:37<00:33, 34.20it/s, est. speed input: 1725.79 toks/s, output: 574.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:27, 40.28it/s, est. speed input: 1902.91 toks/s, output: 650.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:37<00:33, 33.42it/s, est. speed input: 1949.14 toks/s, output: 668.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:38, 28.77it/s, est. speed input: 1992.81 toks/s, output: 689.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:30, 36.74it/s, est. speed input: 2111.54 toks/s, output: 743.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:38<00:32, 33.49it/s, est. speed input: 2162.30 toks/s, output: 763.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:28, 38.01it/s, est. speed input: 2271.70 toks/s, output: 806.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:38<00:28, 37.87it/s, est. speed input: 2326.57 toks/s, output: 830.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:39<00:31, 34.59it/s, est. speed input: 2422.24 toks/s, output: 871.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:33, 32.12it/s, est. speed input: 2468.87 toks/s, output: 895.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:39<00:43, 24.43it/s, est. speed input: 2503.72 toks/s, output: 916.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:39<00:33, 31.42it/s, est. speed input: 2602.39 toks/s, output: 961.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:37, 28.34it/s, est. speed input: 2644.87 toks/s, output: 983.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:29, 35.16it/s, est. speed input: 2750.11 toks/s, output: 1024.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:33, 30.81it/s, est. speed input: 2786.04 toks/s, output: 1040.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:34, 30.29it/s, est. speed input: 2831.30 toks/s, output: 1057.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:31, 32.78it/s, est. speed input: 2882.78 toks/s, output: 1079.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:27, 36.52it/s, est. speed input: 2976.48 toks/s, output: 1127.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:29, 33.51it/s, est. speed input: 3064.42 toks/s, output: 1172.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:26, 37.69it/s, est. speed input: 3210.58 toks/s, output: 1250.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:32, 30.03it/s, est. speed input: 3244.09 toks/s, output: 1268.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:42<00:48, 20.23it/s, est. speed input: 3256.58 toks/s, output: 1282.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:42<00:46, 21.03it/s, est. speed input: 3300.86 toks/s, output: 1294.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:43<00:41, 23.36it/s, est. speed input: 3376.40 toks/s, output: 1334.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:43<00:31, 30.40it/s, est. speed input: 3473.77 toks/s, output: 1386.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:43<00:25, 37.22it/s, est. speed input: 3565.72 toks/s, output: 1442.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:31, 30.28it/s, est. speed input: 3589.54 toks/s, output: 1461.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:19, 48.23it/s, est. speed input: 3784.17 toks/s, output: 1580.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:44<00:12, 70.44it/s, est. speed input: 4044.22 toks/s, output: 1731.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:44<00:12, 71.63it/s, est. speed input: 4142.70 toks/s, output: 1790.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:44<00:16, 53.21it/s, est. speed input: 4213.27 toks/s, output: 1836.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:13, 65.21it/s, est. speed input: 4360.32 toks/s, output: 1914.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:44<00:12, 67.43it/s, est. speed input: 4496.27 toks/s, output: 1995.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:45<00:15, 52.45it/s, est. speed input: 4566.63 toks/s, output: 2036.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:45<00:14, 57.87it/s, est. speed input: 4656.77 toks/s, output: 2088.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:18, 45.26it/s, est. speed input: 4719.65 toks/s, output: 2130.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:45<00:15, 52.95it/s, est. speed input: 4848.04 toks/s, output: 2198.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:46<00:13, 58.57it/s, est. speed input: 4938.70 toks/s, output: 2253.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:46<00:09, 81.28it/s, est. speed input: 5130.61 toks/s, output: 2357.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:46<00:11, 66.55it/s, est. speed input: 5239.79 toks/s, output: 2411.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:46<00:12, 58.61it/s, est. speed input: 5315.44 toks/s, output: 2465.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:08, 81.23it/s, est. speed input: 5503.90 toks/s, output: 2578.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:46<00:08, 79.69it/s, est. speed input: 5631.55 toks/s, output: 2649.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:47<00:07, 96.76it/s, est. speed input: 5820.26 toks/s, output: 2779.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:47<00:10, 66.68it/s, est. speed input: 5925.28 toks/s, output: 2849.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:08, 76.72it/s, est. speed input: 6060.43 toks/s, output: 2947.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:47<00:09, 64.50it/s, est. speed input: 6168.74 toks/s, output: 3025.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:48<00:08, 75.57it/s, est. speed input: 6296.21 toks/s, output: 3108.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:48<00:07, 86.36it/s, est. speed input: 6429.20 toks/s, output: 3216.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:48<00:08, 73.05it/s, est. speed input: 6553.83 toks/s, output: 3295.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:48<00:06, 91.26it/s, est. speed input: 6727.68 toks/s, output: 3404.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:48<00:04, 124.56it/s, est. speed input: 7035.98 toks/s, output: 3608.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:48<00:04, 107.14it/s, est. speed input: 7182.80 toks/s, output: 3677.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:49<00:04, 112.90it/s, est. speed input: 7348.44 toks/s, output: 3781.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:49<00:04, 105.86it/s, est. speed input: 7464.51 toks/s, output: 3875.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:49<00:04, 104.30it/s, est. speed input: 7580.61 toks/s, output: 3968.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:49<00:04, 100.52it/s, est. speed input: 7702.01 toks/s, output: 4063.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:04, 91.89it/s, est. speed input: 7808.05 toks/s, output: 4158.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:50<00:05, 73.72it/s, est. speed input: 7938.27 toks/s, output: 4244.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:50<00:03, 113.31it/s, est. speed input: 8236.19 toks/s, output: 4456.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:50<00:03, 107.13it/s, est. speed input: 8344.69 toks/s, output: 4530.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:50<00:03, 106.34it/s, est. speed input: 8462.45 toks/s, output: 4628.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:50<00:02, 125.08it/s, est. speed input: 8679.40 toks/s, output: 4794.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:50<00:01, 156.71it/s, est. speed input: 8960.40 toks/s, output: 5019.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:51<00:02, 141.37it/s, est. speed input: 9102.00 toks/s, output: 5165.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:51<00:01, 147.03it/s, est. speed input: 9250.18 toks/s, output: 5312.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:51<00:02, 103.04it/s, est. speed input: 9377.66 toks/s, output: 5429.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:51<00:02, 108.45it/s, est. speed input: 9490.74 toks/s, output: 5530.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:51<00:01, 124.03it/s, est. speed input: 9656.04 toks/s, output: 5680.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:51<00:01, 118.58it/s, est. speed input: 9771.62 toks/s, output: 5795.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:52<00:01, 94.67it/s, est. speed input: 9854.88 toks/s, output: 5914.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:52<00:01, 104.77it/s, est. speed input: 10002.03 toks/s, output: 6066.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:52<00:01, 76.62it/s, est. speed input: 10059.70 toks/s, output: 6135.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:52<00:01, 82.63it/s, est. speed input: 10162.45 toks/s, output: 6260.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:52<00:01, 92.75it/s, est. speed input: 10268.34 toks/s, output: 6369.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:52<00:01, 100.39it/s, est. speed input: 10387.82 toks/s, output: 6478.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:00, 96.74it/s, est. speed input: 10558.82 toks/s, output: 6655.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:53<00:00, 88.69it/s, est. speed input: 10649.65 toks/s, output: 6787.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:53<00:00, 85.68it/s, est. speed input: 10713.73 toks/s, output: 6844.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:53<00:00, 83.74it/s, est. speed input: 10767.47 toks/s, output: 6933.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:53<00:00, 75.51it/s, est. speed input: 10846.95 toks/s, output: 7030.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 58.12it/s, est. speed input: 10874.38 toks/s, output: 7095.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 36.47it/s, est. speed input: 10837.39 toks/s, output: 7141.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 11.71it/s, est. speed input: 10447.47 toks/s, output: 6930.15 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.33it/s, est. speed input: 10447.47 toks/s, output: 6930.15 toks/s]
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:57:30 [executor_base.py:219] It took 0.342225 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.53 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:58:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:57:30 [executor_base.py:219] It took 0.339827 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:58:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 07:58:58 [executor_base.py:208] It took 0.324318 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.85 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:59:09 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:59:09 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 07:59:09 [executor_base.py:208] It took 0.327771 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.44687360525131226, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00025520662893541157, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0001914120075525716, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.42197760939598083, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002224049821961671, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002160214353352785, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005557798431254923}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.7021726965904236, 'actor/pg_clipfrac': 0.0031545741949230433, 'actor/ppo_kl': -0.0006592326099053025}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002773529849946499, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004830456746276468}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5956612825393677, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2969576418399811, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013297703117132187}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.31988781690597534, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00018978863954544067, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017062134575098753}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.08861012756824493, 'actor/pg_clipfrac': 0.0007662835414521396, 'actor/ppo_kl': -0.0015675803879275918}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00030244834488257766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00038614473305642605}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.09524685144424438, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001343060634098947}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.5368825793266296, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00036970656947232783}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.21938516199588776, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008507148013450205}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.7312397956848145, 'actor/pg_clipfrac': 0.0010570824379101396, 'actor/ppo_kl': -0.0001509728899691254}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.09055513888597488, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0027812537737190723}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.13044263422489166, 'actor/pg_clipfrac': 0.0011428571306169033, 'actor/ppo_kl': -0.0004773537802975625}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.300295889377594, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00031591442530043423}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.14895778894424438, 'actor/pg_clipfrac': 0.005235602147877216, 'actor/ppo_kl': -0.00010377074795542285}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.009345791302621365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001401571906171739}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.07244620472192764, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003558729658834636}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00020025709818582982, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000602459185756743}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.07702924311161041, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006299821543507278}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.11428812891244888, 'actor/pg_clipfrac': 0.0018248175038024783, 'actor/ppo_kl': 0.0015650874702259898}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00036559993168339133, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002205792348831892}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.49580100178718567, 'actor/pg_clipfrac': 0.0021586616057902575, 'actor/ppo_kl': -0.0004965776461176574}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003347817691974342, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00013487471733242273}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04415406659245491, 'actor/pg_clipfrac': 0.0028873917181044817, 'actor/ppo_kl': -0.0001939768117154017}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.4434460401535034, 'actor/pg_clipfrac': 0.0034207524731755257, 'actor/ppo_kl': -0.0009234698954969645}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.000582982087507844, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -3.164264853694476e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12216373533010483, 'actor/pg_clipfrac': 0.002892960561439395, 'actor/ppo_kl': 0.00029980135150253773}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.26994284987449646, 'actor/pg_clipfrac': 0.002139800228178501, 'actor/ppo_kl': -0.00018611070117913187}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.07605094462633133, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009318121592514217}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.05335443094372749, 'actor/pg_clipfrac': 0.001015744055621326, 'actor/ppo_kl': -0.000669907545670867}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00025304118753410876, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017497024964541197}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00028478223248384893, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014016390778124332}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00028029270470142365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00038341572508215904}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00023307494120672345, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002642417384777218}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021842804562766105, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005032866611145437}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.15304653346538544, 'actor/pg_clipfrac': 0.002918287878856063, 'actor/ppo_kl': 0.0002451814943924546}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.3790484070777893, 'actor/pg_clipfrac': 0.0020325202494859695, 'actor/ppo_kl': -0.0007729007047601044}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0001731534575810656, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004683480365201831}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.11312870681285858, 'actor/pg_clipfrac': 0.0011123470030725002, 'actor/ppo_kl': -0.0001349825761280954}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.29890263080596924, 'actor/pg_clipfrac': 0.0038080730009824038, 'actor/ppo_kl': 0.0005644996417686343}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.45392075181007385, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.7986052625929005e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0496162474155426, 'actor/pg_clipfrac': 0.0011179429711773992, 'actor/ppo_kl': -1.4313111023511738e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12323644757270813, 'actor/pg_clipfrac': 0.0009242144296877086, 'actor/ppo_kl': 0.00041920068906620145}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00041402215720154345, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003639445931185037}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5980557203292847, 'actor/pg_clipfrac': 0.0030706243123859167, 'actor/ppo_kl': -0.000997074763290584}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.31411734223365784, 'actor/pg_clipfrac': 0.001135073835030198, 'actor/ppo_kl': -0.0002464463177602738}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00039748879498802125, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003367357421666384}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2975592613220215, 'actor/pg_clipfrac': 0.0008539709378965199, 'actor/ppo_kl': -0.0003337257367093116}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.07932993769645691, 'actor/pg_clipfrac': 0.0007434944272972643, 'actor/ppo_kl': -0.0009704632102511823}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.47690725326538086, 'actor/pg_clipfrac': 0.0017064845887944102, 'actor/ppo_kl': 0.0006242970121093094}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.31596070528030396, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015699010691605508}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.38651201128959656, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022910407278686762}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0004053155134897679, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00047683005686849356}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5485081672668457, 'actor/pg_clipfrac': 0.0022547913249582052, 'actor/ppo_kl': 0.0002225308126071468}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.06294156610965729, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006396561511792243}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.31800293922424316, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007679242989979684}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3190980851650238, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017058331286534667}
[36m(Runner pid=3309020)[0m Step 43
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.256
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.031
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.018
[36m(Runner pid=3309020)[0m ppo_kl: 3.708097501622376e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.033
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.033
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.673
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.673
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 984696
[36m(Runner pid=3309020)[0m balanced_min: 984398
[36m(Runner pid=3309020)[0m max: 996270
[36m(Runner pid=3309020)[0m mean: 984547.0
[36m(Runner pid=3309020)[0m min: 972824
[36m(Runner pid=3309020)[0m minmax_diff: 23446
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.666
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1136.433
[36m(Runner pid=3309020)[0m time_per_step: 866.348
[36m(Runner pid=3309020)[0m total_num_tokens: 1969094
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 807.0
[36m(Runner pid=3309020)[0m mean: 464.039
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2739.0
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:28<2:00:42, 5.68s/it, est. speed input: 82.39 toks/s, output: 21.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:29<51:08, 2.42s/it, est. speed input: 159.86 toks/s, output: 41.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<29:16, 1.39s/it, est. speed input: 233.38 toks/s, output: 56.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:30<18:51, 1.11it/s, est. speed input: 300.32 toks/s, output: 77.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:32<14:12, 1.47it/s, est. speed input: 359.19 toks/s, output: 98.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<07:55, 2.62it/s, est. speed input: 492.55 toks/s, output: 137.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:34<07:54, 2.61it/s, est. speed input: 533.94 toks/s, output: 154.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<06:09, 3.34it/s, est. speed input: 591.27 toks/s, output: 175.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<03:41, 5.52it/s, est. speed input: 714.90 toks/s, output: 219.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:05, 6.56it/s, est. speed input: 770.29 toks/s, output: 243.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<02:41, 7.53it/s, est. speed input: 823.92 toks/s, output: 266.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:26, 8.24it/s, est. speed input: 873.18 toks/s, output: 289.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<01:55, 10.41it/s, est. speed input: 932.06 toks/s, output: 309.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<01:32, 12.94it/s, est. speed input: 991.32 toks/s, output: 328.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<00:58, 20.27it/s, est. speed input: 1112.88 toks/s, output: 371.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<00:45, 25.99it/s, est. speed input: 1232.19 toks/s, output: 415.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:40, 28.71it/s, est. speed input: 1346.36 toks/s, output: 463.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:29, 38.66it/s, est. speed input: 1524.83 toks/s, output: 535.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:32, 35.19it/s, est. speed input: 1633.85 toks/s, output: 580.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:23, 47.73it/s, est. speed input: 1806.68 toks/s, output: 644.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:43, 25.89it/s, est. speed input: 1889.86 toks/s, output: 682.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:45, 24.46it/s, est. speed input: 1930.88 toks/s, output: 702.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:41, 26.48it/s, est. speed input: 1983.61 toks/s, output: 728.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:30, 35.68it/s, est. speed input: 2148.34 toks/s, output: 800.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:40<00:39, 27.15it/s, est. speed input: 2277.25 toks/s, output: 867.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:37, 28.85it/s, est. speed input: 2328.22 toks/s, output: 891.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:26, 39.49it/s, est. speed input: 2481.06 toks/s, output: 951.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:22, 46.61it/s, est. speed input: 2591.86 toks/s, output: 1004.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:41<00:20, 49.58it/s, est. speed input: 2695.83 toks/s, output: 1038.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:25, 39.79it/s, est. speed input: 2789.01 toks/s, output: 1088.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:41<00:22, 44.74it/s, est. speed input: 2893.06 toks/s, output: 1136.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:22, 45.75it/s, est. speed input: 2985.57 toks/s, output: 1188.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:17, 57.40it/s, est. speed input: 3146.54 toks/s, output: 1259.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:42<00:11, 81.83it/s, est. speed input: 3357.81 toks/s, output: 1367.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:14, 67.69it/s, est. speed input: 3488.84 toks/s, output: 1441.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:14, 67.22it/s, est. speed input: 3581.36 toks/s, output: 1495.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:15, 61.97it/s, est. speed input: 3675.67 toks/s, output: 1552.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:11, 79.97it/s, est. speed input: 3880.29 toks/s, output: 1651.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:11, 81.12it/s, est. speed input: 4069.72 toks/s, output: 1727.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:11, 78.87it/s, est. speed input: 4255.64 toks/s, output: 1813.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:11, 73.10it/s, est. speed input: 4345.21 toks/s, output: 1847.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:15, 55.12it/s, est. speed input: 4418.14 toks/s, output: 1885.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:13, 62.46it/s, est. speed input: 4644.69 toks/s, output: 2019.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:15, 54.40it/s, est. speed input: 4717.54 toks/s, output: 2063.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:44<00:13, 59.04it/s, est. speed input: 4808.83 toks/s, output: 2116.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:44<00:09, 81.76it/s, est. speed input: 5049.34 toks/s, output: 2276.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:45<00:09, 78.02it/s, est. speed input: 5135.30 toks/s, output: 2334.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:08, 90.97it/s, est. speed input: 5367.25 toks/s, output: 2467.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:08, 92.15it/s, est. speed input: 5456.20 toks/s, output: 2511.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:07, 93.12it/s, est. speed input: 5550.50 toks/s, output: 2567.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:45<00:12, 59.11it/s, est. speed input: 5608.40 toks/s, output: 2610.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:08, 86.36it/s, est. speed input: 5841.60 toks/s, output: 2744.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:07, 88.23it/s, est. speed input: 5975.81 toks/s, output: 2806.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:46<00:10, 63.52it/s, est. speed input: 6070.65 toks/s, output: 2869.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:46<00:10, 63.37it/s, est. speed input: 6186.23 toks/s, output: 2933.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:46<00:07, 81.87it/s, est. speed input: 6414.12 toks/s, output: 3104.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:06, 98.41it/s, est. speed input: 6612.41 toks/s, output: 3234.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:47<00:05, 102.10it/s, est. speed input: 6736.70 toks/s, output: 3306.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:47<00:04, 120.19it/s, est. speed input: 6965.14 toks/s, output: 3469.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:47<00:04, 115.21it/s, est. speed input: 7134.79 toks/s, output: 3591.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:47<00:05, 104.53it/s, est. speed input: 7257.65 toks/s, output: 3669.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:47<00:04, 110.82it/s, est. speed input: 7447.89 toks/s, output: 3789.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:48<00:05, 90.03it/s, est. speed input: 7558.94 toks/s, output: 3876.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:05, 90.50it/s, est. speed input: 7682.36 toks/s, output: 3960.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:48<00:04, 101.54it/s, est. speed input: 7818.00 toks/s, output: 4059.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:48<00:03, 122.95it/s, est. speed input: 8038.43 toks/s, output: 4200.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:48<00:03, 115.55it/s, est. speed input: 8192.40 toks/s, output: 4300.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:48<00:03, 118.22it/s, est. speed input: 8320.50 toks/s, output: 4377.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:02, 152.88it/s, est. speed input: 8588.29 toks/s, output: 4593.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:49<00:03, 119.71it/s, est. speed input: 8736.61 toks/s, output: 4700.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:49<00:02, 120.24it/s, est. speed input: 8899.23 toks/s, output: 4827.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:49<00:02, 114.23it/s, est. speed input: 9016.08 toks/s, output: 4914.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:49<00:02, 118.30it/s, est. speed input: 9140.60 toks/s, output: 5000.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:49<00:02, 122.57it/s, est. speed input: 9263.06 toks/s, output: 5089.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:49<00:02, 119.02it/s, est. speed input: 9374.56 toks/s, output: 5177.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:50<00:02, 120.52it/s, est. speed input: 9525.80 toks/s, output: 5308.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:50<00:02, 89.80it/s, est. speed input: 9643.60 toks/s, output: 5430.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:50<00:02, 100.18it/s, est. speed input: 9759.54 toks/s, output: 5536.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:50<00:01, 108.32it/s, est. speed input: 9873.14 toks/s, output: 5641.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:50<00:01, 124.17it/s, est. speed input: 10031.34 toks/s, output: 5806.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:02, 78.16it/s, est. speed input: 10097.04 toks/s, output: 5861.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:51<00:03, 50.47it/s, est. speed input: 10115.06 toks/s, output: 5908.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:02, 55.06it/s, est. speed input: 10175.97 toks/s, output: 5999.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:51<00:02, 57.60it/s, est. speed input: 10242.19 toks/s, output: 6071.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:52<00:01, 68.34it/s, est. speed input: 10345.40 toks/s, output: 6182.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:52<00:01, 68.91it/s, est. speed input: 10402.67 toks/s, output: 6246.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:01, 92.03it/s, est. speed input: 10560.96 toks/s, output: 6407.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:00, 100.52it/s, est. speed input: 10668.77 toks/s, output: 6519.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:52<00:00, 72.15it/s, est. speed input: 10739.18 toks/s, output: 6601.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:52<00:00, 73.38it/s, est. speed input: 10802.40 toks/s, output: 6660.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:53<00:00, 53.53it/s, est. speed input: 10826.71 toks/s, output: 6720.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:53<00:00, 51.23it/s, est. speed input: 10867.71 toks/s, output: 6791.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 45.93it/s, est. speed input: 10902.58 toks/s, output: 6838.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 39.15it/s, est. speed input: 10917.24 toks/s, output: 6882.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 33.09it/s, est. speed input: 10917.99 toks/s, output: 6909.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 13.53it/s, est. speed input: 10683.18 toks/s, output: 6790.60 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.85it/s, est. speed input: 10683.18 toks/s, output: 6790.60 toks/s]
[36m(Runner pid=3309020)[0m Error during comparison
[36m(Runner pid=3309020)[0m Traceback (most recent call last):
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 809, in compare_single_extraction_wrapper
[36m(Runner pid=3309020)[0m return compare_single_extraction(g, t)
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/utils.py", line 51, in wrapper
[36m(Runner pid=3309020)[0m return func(*args, **kwargs)
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 789, in compare_single_extraction
[36m(Runner pid=3309020)[0m return sympy_expr_eq(
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 667, in sympy_expr_eq
[36m(Runner pid=3309020)[0m return sympy_compare_relational(gold, pred, float_rounding, numeric_precision)
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 344, in sympy_compare_relational
[36m(Runner pid=3309020)[0m if sympy_solve_and_compare(gold, pred, float_rounding, numeric_precision):
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 278, in sympy_solve_and_compare
[36m(Runner pid=3309020)[0m return all(
[36m(Runner pid=3309020)[0m File "/home/huzhe/env/conda/anaconda3/envs/verl/lib/python3.10/site-packages/math_verify/grader.py", line 282, in
[36m(Runner pid=3309020)[0m for (g_k, g_v), (p_k, p_v) in zip(sorted(g.items()), sorted(p.items()))
[36m(Runner pid=3309020)[0m AttributeError: 'Integer' object has no attribute 'items'
[36m(Runner pid=3309020)[0m mean: 305.138
[36m(Runner pid=3309020)[0m min: 60.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.348
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.673
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.145
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.424
[36m(Runner pid=3309020)[0m gen: 113.263
[36m(Runner pid=3309020)[0m old: 88.408
[36m(Runner pid=3309020)[0m ref: 88.464
[36m(Runner pid=3309020)[0m reward: 6.67
[36m(Runner pid=3309020)[0m step: 866.348
[36m(Runner pid=3309020)[0m update_actor: 568.398
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 44; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:11:58 [executor_base.py:219] It took 0.339409 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:13:24 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:11:58 [executor_base.py:219] It took 0.340909 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:13:25 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:13:25 [executor_base.py:208] It took 0.325915 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:13:30 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:13:30 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:13:30 [executor_base.py:208] It took 0.327304 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002514147781766951, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008203638135455549}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.19546978175640106, 'actor/pg_clipfrac': 0.001108647440560162, 'actor/ppo_kl': -0.0005481322295963764}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.38711026310920715, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.12955854833126068, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0003189232957083732, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.68645147420466e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.37740281224250793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3641859292984009, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.9973676800727844, 'actor/pg_clipfrac': 0.0035273367539048195, 'actor/ppo_kl': -0.00022769773204345256}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00018388872558716685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010893625440075994}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.30629807710647583, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00026796318707056344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009658659691922367}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1271468549966812, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.27695947885513306, 'actor/pg_clipfrac': 0.00045045046135783195, 'actor/ppo_kl': -0.0007141010137274861}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.05701012536883354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.11089195311069489, 'actor/pg_clipfrac': 0.0008873114711605012, 'actor/ppo_kl': -0.001073513994924724}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3440948724746704, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.507495641708374, 'actor/pg_clipfrac': 0.000841042899992317, 'actor/ppo_kl': -0.0001961136149475351}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.19575372338294983, 'actor/pg_clipfrac': 0.0006653359741903841, 'actor/ppo_kl': -0.0005853288457728922}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00016042584320530295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000600483501330018}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.38346827030181885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00053537602070719}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.22474917769432068, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015512381214648485}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.11887867003679276, 'actor/pg_clipfrac': 0.0020703934133052826, 'actor/ppo_kl': 0.0005023928824812174}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002447559090796858, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004689927154686302}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.5608358979225159, 'actor/pg_clipfrac': 0.0010245901066809893, 'actor/ppo_kl': 0.0002518091059755534}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.08830127120018005, 'actor/pg_clipfrac': 0.0021598271559923887, 'actor/ppo_kl': -0.0017239204607903957}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.05403520539402962, 'actor/pg_clipfrac': 0.0007980845985002816, 'actor/ppo_kl': -0.0016497819451615214}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00034703852725215256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010875912848860025}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.16476713120937347, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0026178481057286263}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00024619614123366773, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004014497681055218}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.33429035544395447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0019716089591383934}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0004413639544509351, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005445870338007808}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.12753748893737793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007581081590615213}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5186137557029724, 'actor/pg_clipfrac': 0.0015455950051546097, 'actor/ppo_kl': 0.0016892298590391874}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1107722744345665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030482723377645016}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.05669504031538963, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001368555473163724}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0003665705444291234, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016188877634704113}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.09141357988119125, 'actor/pg_clipfrac': 0.0015822785208001733, 'actor/ppo_kl': -0.0013593088369816542}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00013838591985404491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001109199714846909}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.22200645506381989, 'actor/pg_clipfrac': 0.0034722222480922937, 'actor/ppo_kl': 0.0004782720934599638}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.12028972059488297, 'actor/pg_clipfrac': 0.0016380016459152102, 'actor/ppo_kl': 0.002087390748783946}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.15279234945774078, 'actor/pg_clipfrac': 0.0013636363437399268, 'actor/ppo_kl': -0.00037515899748541415}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.12608833611011505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000231461352086626}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5221496224403381, 'actor/pg_clipfrac': 0.0023980815894901752, 'actor/ppo_kl': 0.0007083970704115927}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.06911572813987732, 'actor/pg_clipfrac': 0.0020505809225142, 'actor/ppo_kl': 0.0012128768721595407}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00017783242219593376, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.782899724086747e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00022794386313762516, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002496608765795827}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.07036289572715759, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010955292964354157}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.28660961985588074, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025294889928773046}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00041098962537944317, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001221667742356658}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.39204874634742737, 'actor/pg_clipfrac': 0.002320185536518693, 'actor/ppo_kl': 0.0007635612273588777}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00035629881313070655, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012486870400607586}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5790050625801086, 'actor/pg_clipfrac': 0.004599816165864468, 'actor/ppo_kl': 0.0013580691302195191}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.23089267313480377, 'actor/pg_clipfrac': 0.005361930467188358, 'actor/ppo_kl': 0.0013906429521739483}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.05817997455596924, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005982230068184435}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.7980553507804871, 'actor/pg_clipfrac': 0.0014347202377393842, 'actor/ppo_kl': 0.0005967921460978687}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.09662021696567535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019237271044403315}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.29144227504730225, 'actor/pg_clipfrac': 0.0005537098622880876, 'actor/ppo_kl': -0.000411300832638517}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.08883461356163025, 'actor/pg_clipfrac': 0.001054852269589901, 'actor/ppo_kl': -0.0010170413879677653}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2606143057346344, 'actor/pg_clipfrac': 0.004313000477850437, 'actor/ppo_kl': -0.0016390165546908975}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.13106270134449005, 'actor/pg_clipfrac': 0.0016792610986158252, 'actor/ppo_kl': -0.001013835077174008}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3107230067253113, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000714889436494559}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.10500261187553406, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017369200941175222}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.14239096641540527, 'actor/pg_clipfrac': 0.001891551073640585, 'actor/ppo_kl': 0.0024109415244311094}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.34605464339256287, 'actor/pg_clipfrac': 0.0008389261784031987, 'actor/ppo_kl': 0.0005607173079624772}
[36m(Runner pid=3309020)[0m Step 44
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.245
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.037
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.008
[36m(Runner pid=3309020)[0m ppo_kl: 2.4999007486314895e-07
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.01
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.657
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.657
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:47:38, 5.07s/it, est. speed input: 90.41 toks/s, output: 24.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:27<49:56, 2.36s/it, est. speed input: 165.24 toks/s, output: 45.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<31:12, 1.48s/it, est. speed input: 231.00 toks/s, output: 65.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<21:33, 1.03s/it, est. speed input: 292.20 toks/s, output: 85.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:32<15:33, 1.34it/s, est. speed input: 354.95 toks/s, output: 102.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<11:12, 1.86it/s, est. speed input: 417.81 toks/s, output: 123.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:34<08:29, 2.44it/s, est. speed input: 476.42 toks/s, output: 144.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:34<06:04, 3.40it/s, est. speed input: 542.01 toks/s, output: 167.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:34<04:21, 4.72it/s, est. speed input: 608.27 toks/s, output: 186.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<02:24, 8.44it/s, est. speed input: 795.40 toks/s, output: 251.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:01, 10.02it/s, est. speed input: 857.63 toks/s, output: 274.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<01:14, 16.15it/s, est. speed input: 1042.62 toks/s, output: 341.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:35<01:10, 16.99it/s, est. speed input: 1099.91 toks/s, output: 358.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:36<01:04, 18.49it/s, est. speed input: 1212.33 toks/s, output: 407.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<01:12, 16.38it/s, est. speed input: 1257.11 toks/s, output: 428.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<00:51, 22.58it/s, est. speed input: 1383.69 toks/s, output: 467.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:52, 21.85it/s, est. speed input: 1536.03 toks/s, output: 517.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:49, 23.40it/s, est. speed input: 1589.67 toks/s, output: 539.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:54, 20.82it/s, est. speed input: 1639.24 toks/s, output: 559.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:49, 23.03it/s, est. speed input: 1701.84 toks/s, output: 578.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:48, 23.40it/s, est. speed input: 1750.65 toks/s, output: 593.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:43, 25.58it/s, est. speed input: 1852.30 toks/s, output: 639.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:27, 39.85it/s, est. speed input: 2025.06 toks/s, output: 716.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:30, 36.23it/s, est. speed input: 2122.25 toks/s, output: 762.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:29, 37.41it/s, est. speed input: 2224.74 toks/s, output: 809.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:24, 44.87it/s, est. speed input: 2332.62 toks/s, output: 856.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:30, 35.53it/s, est. speed input: 2421.86 toks/s, output: 901.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:37, 28.17it/s, est. speed input: 2458.39 toks/s, output: 924.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:39, 26.58it/s, est. speed input: 2505.09 toks/s, output: 945.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:40<00:36, 29.13it/s, est. speed input: 2552.95 toks/s, output: 972.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:33, 31.74it/s, est. speed input: 2604.17 toks/s, output: 993.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:42, 24.51it/s, est. speed input: 2636.54 toks/s, output: 1008.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:24, 42.84it/s, est. speed input: 2797.15 toks/s, output: 1080.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:16, 60.62it/s, est. speed input: 2955.25 toks/s, output: 1136.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:11, 86.95it/s, est. speed input: 3168.76 toks/s, output: 1243.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:41<00:12, 81.46it/s, est. speed input: 3316.66 toks/s, output: 1311.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:41<00:12, 78.15it/s, est. speed input: 3459.49 toks/s, output: 1390.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:17, 55.03it/s, est. speed input: 3535.05 toks/s, output: 1431.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:21, 44.54it/s, est. speed input: 3610.92 toks/s, output: 1469.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:21, 43.40it/s, est. speed input: 3696.54 toks/s, output: 1515.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:43<00:20, 45.30it/s, est. speed input: 3786.92 toks/s, output: 1558.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:43<00:20, 44.11it/s, est. speed input: 3869.27 toks/s, output: 1606.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:43<00:25, 35.31it/s, est. speed input: 3895.40 toks/s, output: 1612.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:43<00:18, 49.50it/s, est. speed input: 4044.20 toks/s, output: 1699.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:44<00:16, 52.33it/s, est. speed input: 4193.12 toks/s, output: 1787.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:44<00:13, 64.91it/s, est. speed input: 4334.27 toks/s, output: 1872.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:44<00:13, 63.63it/s, est. speed input: 4422.49 toks/s, output: 1920.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:44<00:13, 62.79it/s, est. speed input: 4515.46 toks/s, output: 1965.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:13, 62.28it/s, est. speed input: 4598.58 toks/s, output: 2028.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:45<00:18, 43.54it/s, est. speed input: 4660.73 toks/s, output: 2047.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:18, 43.10it/s, est. speed input: 4739.83 toks/s, output: 2110.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:45<00:16, 47.84it/s, est. speed input: 4827.04 toks/s, output: 2175.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:45<00:13, 57.01it/s, est. speed input: 4968.74 toks/s, output: 2250.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:08, 89.29it/s, est. speed input: 5216.31 toks/s, output: 2387.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:07, 103.78it/s, est. speed input: 5404.92 toks/s, output: 2494.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:07, 94.32it/s, est. speed input: 5529.05 toks/s, output: 2565.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:46<00:07, 93.78it/s, est. speed input: 5652.74 toks/s, output: 2653.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:07, 99.75it/s, est. speed input: 5789.48 toks/s, output: 2729.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:46<00:06, 100.67it/s, est. speed input: 6010.04 toks/s, output: 2850.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:05, 126.59it/s, est. speed input: 6278.53 toks/s, output: 3042.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:46<00:05, 123.05it/s, est. speed input: 6445.50 toks/s, output: 3166.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:06, 93.95it/s, est. speed input: 6554.23 toks/s, output: 3225.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:47<00:07, 79.94it/s, est. speed input: 6669.98 toks/s, output: 3297.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:47<00:08, 67.96it/s, est. speed input: 6728.12 toks/s, output: 3341.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:47<00:06, 86.13it/s, est. speed input: 6905.89 toks/s, output: 3465.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:47<00:05, 103.61it/s, est. speed input: 7086.70 toks/s, output: 3581.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:48<00:05, 105.53it/s, est. speed input: 7210.83 toks/s, output: 3678.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:48<00:04, 108.66it/s, est. speed input: 7372.43 toks/s, output: 3782.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:48<00:04, 99.20it/s, est. speed input: 7491.93 toks/s, output: 3868.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:48<00:04, 101.26it/s, est. speed input: 7638.88 toks/s, output: 3963.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:48<00:04, 103.04it/s, est. speed input: 7756.64 toks/s, output: 4048.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:48<00:03, 114.40it/s, est. speed input: 7925.17 toks/s, output: 4151.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:49<00:04, 100.51it/s, est. speed input: 8043.14 toks/s, output: 4233.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:49<00:03, 121.03it/s, est. speed input: 8231.62 toks/s, output: 4334.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:49<00:03, 98.95it/s, est. speed input: 8340.64 toks/s, output: 4425.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 102.43it/s, est. speed input: 8456.56 toks/s, output: 4537.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 103.60it/s, est. speed input: 8575.17 toks/s, output: 4617.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:49<00:02, 128.73it/s, est. speed input: 8782.14 toks/s, output: 4769.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:49<00:02, 143.00it/s, est. speed input: 8941.89 toks/s, output: 4884.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:50<00:02, 117.07it/s, est. speed input: 9078.24 toks/s, output: 5000.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:50<00:02, 115.08it/s, est. speed input: 9198.31 toks/s, output: 5090.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:50<00:02, 95.40it/s, est. speed input: 9291.80 toks/s, output: 5167.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:50<00:02, 85.39it/s, est. speed input: 9380.95 toks/s, output: 5256.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:03, 81.30it/s, est. speed input: 9445.54 toks/s, output: 5312.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:51<00:02, 103.86it/s, est. speed input: 9616.98 toks/s, output: 5436.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:51<00:01, 117.94it/s, est. speed input: 9800.40 toks/s, output: 5616.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:51<00:01, 132.25it/s, est. speed input: 9960.39 toks/s, output: 5771.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:51<00:01, 111.01it/s, est. speed input: 10054.46 toks/s, output: 5866.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 108.74it/s, est. speed input: 10163.93 toks/s, output: 5985.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:01, 79.26it/s, est. speed input: 10239.53 toks/s, output: 6074.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:52<00:01, 79.46it/s, est. speed input: 10333.65 toks/s, output: 6186.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:52<00:01, 71.17it/s, est. speed input: 10378.02 toks/s, output: 6241.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:01, 63.99it/s, est. speed input: 10424.93 toks/s, output: 6305.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:52<00:01, 60.31it/s, est. speed input: 10473.29 toks/s, output: 6378.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:01, 61.36it/s, est. speed input: 10526.99 toks/s, output: 6437.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 60.38it/s, est. speed input: 10578.00 toks/s, output: 6503.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:01, 53.79it/s, est. speed input: 10614.62 toks/s, output: 6584.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 51.04it/s, est. speed input: 10659.14 toks/s, output: 6662.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:54<00:01, 33.37it/s, est. speed input: 10635.74 toks/s, output: 6676.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:54<00:00, 47.09it/s, est. speed input: 10745.95 toks/s, output: 6806.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 28.44it/s, est. speed input: 10681.35 toks/s, output: 6814.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 28.39it/s, est. speed input: 10697.62 toks/s, output: 6871.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 20.78it/s, est. speed input: 10629.52 toks/s, output: 6861.44 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 22.91it/s, est. speed input: 10629.52 toks/s, output: 6861.44 toks/s]
[36m(Runner pid=3309020)[0m balanced_max: 994157
[36m(Runner pid=3309020)[0m balanced_min: 994156
[36m(Runner pid=3309020)[0m max: 1009287
[36m(Runner pid=3309020)[0m mean: 994156.5
[36m(Runner pid=3309020)[0m min: 979026
[36m(Runner pid=3309020)[0m minmax_diff: 30261
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.357
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1163.441
[36m(Runner pid=3309020)[0m time_per_step: 854.496
[36m(Runner pid=3309020)[0m total_num_tokens: 1988313
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 671.0
[36m(Runner pid=3309020)[0m mean: 469.992
[36m(Runner pid=3309020)[0m min: 413.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1580.0
[36m(Runner pid=3309020)[0m mean: 306.693
[36m(Runner pid=3309020)[0m min: 58.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.316
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.657
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.418311501904219e-05
[36m(Runner pid=3309020)[0m gen: 0.138
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.283
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.167
[36m(Runner pid=3309020)[0m gen: 108.108
[36m(Runner pid=3309020)[0m old: 87.733
[36m(Runner pid=3309020)[0m ref: 87.921
[36m(Runner pid=3309020)[0m reward: 6.561
[36m(Runner pid=3309020)[0m step: 854.496
[36m(Runner pid=3309020)[0m update_actor: 563.384
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 45; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:26:12 [executor_base.py:219] It took 0.339980 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:27:39 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:26:12 [executor_base.py:219] It took 0.340584 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:27:39 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:27:39 [executor_base.py:208] It took 0.327147 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.85 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:27:40 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:27:40 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:27:40 [executor_base.py:208] It took 0.325480 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.000281608197838068, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.065224789083004, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007890518172644079}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003301141259726137, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0023143377620726824}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.08095293492078781, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009537509176880121}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5974672436714172, 'actor/pg_clipfrac': 0.0011876485077664256, 'actor/ppo_kl': 0.0006836519460193813}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.04059305414557457, 'actor/pg_clipfrac': 0.0029411765281111, 'actor/ppo_kl': 0.0003680360096041113}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.11759229749441147, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001275511021958664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.6800232529640198, 'actor/pg_clipfrac': 0.00042753314482979476, 'actor/ppo_kl': -0.00015371321933344007}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002805480908136815, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012938446598127484}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003530188696458936, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00012795047950930893}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.12349508702754974, 'actor/pg_clipfrac': 0.0020120723638683558, 'actor/ppo_kl': -0.001619319780729711}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.17777402698993683, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.13662020862102509, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002684251812752336}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.13280607759952545, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003390708880033344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.14286506175994873, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012485927436500788}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.410746306180954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000568904506508261}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.33250972628593445, 'actor/pg_clipfrac': 0.0010638297535479069, 'actor/ppo_kl': 0.0014674531994387507}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.2285528928041458, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005832871538586915}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00028303894214332104, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011555732926353812}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.31601157784461975, 'actor/pg_clipfrac': 0.0026490066666156054, 'actor/ppo_kl': 1.6116464394144714e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2870946526527405, 'actor/pg_clipfrac': 0.0009208103292621672, 'actor/ppo_kl': -0.0015474867541342974}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2312801033258438, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010802049655467272}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.447842538356781, 'actor/pg_clipfrac': 0.002211166312918067, 'actor/ppo_kl': 0.00014097720850259066}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.25255057215690613, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005953410291112959}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00047476819599978626, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002036621328443289}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.25869935750961304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005986650940030813}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.04994802549481392, 'actor/pg_clipfrac': 0.0030706243123859167, 'actor/ppo_kl': 0.0005120245041325688}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.29019680619239807, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00032030532020144165}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.010401738807559013, 'actor/pg_clipfrac': 0.0008561643771827221, 'actor/ppo_kl': -0.0011157173430547118}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.21945859491825104, 'actor/pg_clipfrac': 0.000994035741314292, 'actor/ppo_kl': 0.001187591813504696}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.07015601545572281, 'actor/pg_clipfrac': 0.0018399263499304652, 'actor/ppo_kl': -0.0018211582209914923}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00022018252639099956, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022743191220797598}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3725644052028656, 'actor/pg_clipfrac': 0.0006662225350737572, 'actor/ppo_kl': -0.0014784925151616335}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.27895981073379517, 'actor/pg_clipfrac': 0.0015479875728487968, 'actor/ppo_kl': 0.00080857117427513}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.10234612226486206, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004758623836096376}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3684495985507965, 'actor/pg_clipfrac': 0.007149240467697382, 'actor/ppo_kl': -0.00025963803636841476}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.028184320777654648, 'actor/pg_clipfrac': 0.0011961722047999501, 'actor/ppo_kl': 0.0016549420543015003}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00033418380189687014, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012473523383960128}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.4442571997642517, 'actor/pg_clipfrac': 0.0034275921061635017, 'actor/ppo_kl': -0.0011593083618208766}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.08384563028812408, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007640481926500797}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2085074484348297, 'actor/pg_clipfrac': 0.0008058017701841891, 'actor/ppo_kl': 0.0022651017643511295}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3423970937728882, 'actor/pg_clipfrac': 0.0013175230706110597, 'actor/ppo_kl': 0.001764073851518333}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.03487912937998772, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.58264068281278e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.13424500823020935, 'actor/pg_clipfrac': 0.0028409091755747795, 'actor/ppo_kl': 0.0003045472258236259}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2761896848678589, 'actor/pg_clipfrac': 0.001166180707514286, 'actor/ppo_kl': -0.0005260590114630759}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.06946783512830734, 'actor/pg_clipfrac': 0.0018867924809455872, 'actor/ppo_kl': 0.00035331654362380505}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.24164029955863953, 'actor/pg_clipfrac': 0.0022547913249582052, 'actor/ppo_kl': 0.0009417313267476857}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.07876195758581161, 'actor/pg_clipfrac': 0.0030030030757188797, 'actor/ppo_kl': -1.9737908587558195e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.16007326543331146, 'actor/pg_clipfrac': 0.0015337422955781221, 'actor/ppo_kl': -0.0008868059376254678}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00033038415131159127, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009523387998342514}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.2393401563167572, 'actor/pg_clipfrac': 0.0014430014416575432, 'actor/ppo_kl': 0.0006994354771450162}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.30939242243766785, 'actor/pg_clipfrac': 0.0017391304718330503, 'actor/ppo_kl': 0.0009321461548097432}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00018957177235279232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009870213689282537}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00031730069895274937, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00036872384953312576}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00023670132213737816, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.888463246364381e-09}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1879786103963852, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013584241969510913}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5761919021606445, 'actor/pg_clipfrac': 0.0013227512827143073, 'actor/ppo_kl': -0.00013272471551317722}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5394213199615479, 'actor/pg_clipfrac': 0.001437814556993544, 'actor/ppo_kl': -0.0007004041690379381}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002516711538191885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -6.689285510219634e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.06312783062458038, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005086932796984911}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.7880194783210754, 'actor/pg_clipfrac': 0.0007047216058708727, 'actor/ppo_kl': 8.499866089550778e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.0879882350564003, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003156678285449743}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:37:53, 15.62s/it, est. speed input: 30.15 toks/s, output: 6.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<40:41, 6.51s/it, est. speed input: 59.22 toks/s, output: 12.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 4/377 [00:16<15:44, 2.53s/it, est. speed input: 113.60 toks/s, output: 25.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:16<08:29, 1.37s/it, est. speed input: 170.60 toks/s, output: 39.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 10/377 [00:16<03:41, 1.66it/s, est. speed input: 281.13 toks/s, output: 66.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 13/377 [00:16<02:22, 2.56it/s, est. speed input: 361.76 toks/s, output: 88.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 16/377 [00:16<01:38, 3.68it/s, est. speed input: 441.81 toks/s, output: 109.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 19/377 [00:16<01:11, 5.02it/s, est. speed input: 520.09 toks/s, output: 131.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 21/377 [00:16<01:00, 5.86it/s, est. speed input: 569.44 toks/s, output: 146.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:17<00:36, 9.66it/s, est. speed input: 699.70 toks/s, output: 184.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 29/377 [00:17<00:31, 11.22it/s, est. speed input: 774.61 toks/s, output: 207.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 32/377 [00:17<00:30, 11.38it/s, est. speed input: 841.43 toks/s, output: 231.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 37/377 [00:17<00:21, 16.00it/s, est. speed input: 964.48 toks/s, output: 272.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 42/377 [00:17<00:16, 20.48it/s, est. speed input: 1089.60 toks/s, output: 315.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 49/377 [00:17<00:12, 25.33it/s, est. speed input: 1257.63 toks/s, output: 377.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 55/377 [00:18<00:10, 30.29it/s, est. speed input: 1403.86 toks/s, output: 431.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 63/377 [00:18<00:08, 36.46it/s, est. speed input: 1596.46 toks/s, output: 503.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 71/377 [00:18<00:07, 43.50it/s, est. speed input: 1786.90 toks/s, output: 574.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 80/377 [00:18<00:05, 51.49it/s, est. speed input: 2004.80 toks/s, output: 658.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 87/377 [00:18<00:05, 53.52it/s, est. speed input: 2164.77 toks/s, output: 722.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 94/377 [00:18<00:05, 55.38it/s, est. speed input: 2324.52 toks/s, output: 788.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 102/377 [00:18<00:04, 59.22it/s, est. speed input: 2508.87 toks/s, output: 865.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 109/377 [00:18<00:04, 56.01it/s, est. speed input: 2671.09 toks/s, output: 933.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 115/377 [00:19<00:04, 55.37it/s, est. speed input: 2802.55 toks/s, output: 991.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 123/377 [00:19<00:04, 60.37it/s, est. speed input: 2979.88 toks/s, output: 1071.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 131/377 [00:19<00:03, 62.22it/s, est. speed input: 3156.11 toks/s, output: 1151.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 138/377 [00:19<00:04, 54.88it/s, est. speed input: 3296.93 toks/s, output: 1219.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 148/377 [00:19<00:03, 63.25it/s, est. speed input: 3517.70 toks/s, output: 1324.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 156/377 [00:19<00:03, 64.80it/s, est. speed input: 3687.11 toks/s, output: 1407.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:19<00:03, 63.58it/s, est. speed input: 3831.91 toks/s, output: 1481.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 170/377 [00:19<00:03, 63.67it/s, est. speed input: 3974.51 toks/s, output: 1556.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 177/377 [00:20<00:03, 63.66it/s, est. speed input: 4117.05 toks/s, output: 1632.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 184/377 [00:20<00:02, 64.39it/s, est. speed input: 4261.88 toks/s, output: 1710.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 191/377 [00:20<00:02, 65.63it/s, est. speed input: 4402.31 toks/s, output: 1788.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 199/377 [00:20<00:02, 66.12it/s, est. speed input: 4564.10 toks/s, output: 1879.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 206/377 [00:20<00:03, 53.09it/s, est. speed input: 4678.80 toks/s, output: 1951.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 216/377 [00:20<00:02, 61.49it/s, est. speed input: 4886.55 toks/s, output: 2073.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 225/377 [00:20<00:02, 65.67it/s, est. speed input: 5064.67 toks/s, output: 2181.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 233/377 [00:20<00:02, 67.10it/s, est. speed input: 5217.26 toks/s, output: 2277.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 243/377 [00:21<00:02, 59.98it/s, est. speed input: 5388.73 toks/s, output: 2393.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 250/377 [00:21<00:02, 56.93it/s, est. speed input: 5510.78 toks/s, output: 2476.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 256/377 [00:21<00:02, 56.14it/s, est. speed input: 5613.76 toks/s, output: 2550.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 262/377 [00:21<00:02, 52.07it/s, est. speed input: 5713.69 toks/s, output: 2623.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 270/377 [00:21<00:01, 56.35it/s, est. speed input: 5867.43 toks/s, output: 2730.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 277/377 [00:21<00:01, 56.39it/s, est. speed input: 5986.07 toks/s, output: 2822.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 284/377 [00:21<00:01, 58.56it/s, est. speed input: 6108.34 toks/s, output: 2918.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 291/377 [00:21<00:01, 58.72it/s, est. speed input: 6227.16 toks/s, output: 3015.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 297/377 [00:22<00:01, 53.11it/s, est. speed input: 6314.34 toks/s, output: 3093.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 303/377 [00:22<00:01, 48.06it/s, est. speed input: 6401.08 toks/s, output: 3173.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:22<00:01, 54.81it/s, est. speed input: 6593.93 toks/s, output: 3337.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 322/377 [00:22<00:00, 58.19it/s, est. speed input: 6726.33 toks/s, output: 3462.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 329/377 [00:22<00:00, 58.30it/s, est. speed input: 6837.20 toks/s, output: 3569.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 335/377 [00:22<00:00, 55.22it/s, est. speed input: 6925.21 toks/s, output: 3660.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 341/377 [00:22<00:00, 48.71it/s, est. speed input: 7002.63 toks/s, output: 3747.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 346/377 [00:23<00:00, 33.65it/s, est. speed input: 7016.52 toks/s, output: 3798.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 351/377 [00:23<00:00, 29.93it/s, est. speed input: 7049.37 toks/s, output: 3866.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 355/377 [00:23<00:00, 22.01it/s, est. speed input: 7026.66 toks/s, output: 3896.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:23<00:00, 21.70it/s, est. speed input: 7043.54 toks/s, output: 3940.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:24<00:00, 19.11it/s, est. speed input: 7041.95 toks/s, output: 3973.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:25<00:01, 7.00it/s, est. speed input: 6738.49 toks/s, output: 3851.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:26<00:01, 5.52it/s, est. speed input: 6595.49 toks/s, output: 3807.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:27<00:02, 3.62it/s, est. speed input: 6325.83 toks/s, output: 3696.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:55<00:31, 3.89s/it, est. speed input: 3131.80 toks/s, output: 1900.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [01:04<00:32, 4.70s/it, est. speed input: 2700.84 toks/s, output: 1715.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [01:07<00:25, 4.28s/it, est. speed input: 2611.90 toks/s, output: 1736.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [01:07<00:07, 2.42s/it, est. speed input: 2628.81 toks/s, output: 1978.93 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:07<00:00, 5.61it/s, est. speed input: 2649.82 toks/s, output: 2223.73 toks/s]
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:40:53 [executor_base.py:219] It took 0.339054 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:42:44 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:40:53 [executor_base.py:219] It took 0.339760 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:42:44 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:42:44 [executor_base.py:208] It took 0.327823 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:42:53 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:42:54 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:42:54 [executor_base.py:208] It took 0.325780 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric properties and the constraints given in the problem. Since D is the midpoint of AB and BD = 2.0, it follows that AD = 2.0 as well. The line l is the vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of sides AE, EC, and AC.\n\nSince E is on the line l, the distance from E to A (AE) is constant and equal to AD, which is 2.0. The distance from E to C (EC) will be minimized when E is directly below C on the line l. This means that EC is the same length as the distance from C to the line l, which is the perpendicular distance from C to the line l.\n\nThe minimum perimeter of triangle AEC occurs when E is directly below C on the line l, making EC = AC. Therefore, the perimeter of triangle AEC is minimized when AE + EC is minimized, which is the same as AD + AC.\n\nSince the perimeter of triangle ABC is 16.0 and AD = 2.0, it follows that AC + AB = 14.0. The minimum perimeter of triangle AEC is then AC + AD = AC + 2.0.\n\nSince AC + AB = 14.0 and AB = 2AD = 4.0, it follows that AC = 10.0. Therefore, the minimum perimeter of triangle AEC is AC + AD = 10.0 + 2.0 = 12.0.\n\nTherefore, the minimum perimeter of triangle AEC is 12.0.\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated 90 degrees clockwise around point A to form triangle AB'C', we know that angle BAC is 90 degrees. Since the rotation is 90 degrees, angle B'AC' is also 90 degrees. \n\nGiven angle CC'B' = 32.0 degrees, we can deduce that angle AC'B' is the complement of angle CC'B' because the sum of angles in a triangle is 180 degrees. Therefore, angle AC'B' = 180 - 90 - 32.0 = 58.0 degrees.\n\nHowever, since triangle AB'C' is formed by a 90-degree clockwise rotation of triangle ABC, angle AC'B' is actually the same as angle AC'B (since they are the same angle measured from the same point and direction).\n\nSo, angle AC'B = 58.0 degrees.\n\n\nThe size of angle AC'B' is 58.0 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve for the degree of angle AEB, we need to use properties of circles and parallelograms. Since A, B, and D are vertices of a parallelogram and all lie on the circle, we can infer that angle ADC is equal to angle ABC because opposite angles in a parallelogram are equal. Given that angle ADC = 54°, angle ABC = 54°. Since BE is the diameter of the circle, angle BAE = 90° (as any angle subtended by a diameter in a circle is a right angle). Therefore, angle AEB = 90° - angle ABC = 90° - 54° = 36°.\n\n\nThe degree of angle AEB is 36°.\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to understand the properties of a circle and the relationships between angles and tangents. Since AB is the diameter of circle O, angle ADB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Given that angle A = 35 degrees, we can find angle D by subtracting angle A from 90 degrees, which gives us angle D = 90 - 35 = 55 degrees.\n\nSince CD is a tangent to the circle at point D, angle ODC is also a right angle (90 degrees). Therefore, angle C can be found by subtracting angle D from 90 degrees, which gives us angle C = 90 - 55 = 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ACD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Since EA is a tangent to the circle at point A, angle EAC is given as 120 degrees. We can use these facts to find angle ABC.\n\nFirst, we find angle CAD:\n\\[ \\text{angle CAD} = 90^\\circ - \\text{angle EAC} = 90^\\circ - 120^\\circ \\]\nHowever, this calculation seems incorrect because we cannot subtract 120 degrees from 90 degrees. The correct interpretation is that angle EAC is an external angle to the right angle at A, which means it is supplementary to the right angle. Therefore, angle CAD should be:\n\\[ \\text{angle CAD} = 180^\\circ - 90^\\circ - 120^\\circ = 30^\\circ \\]\n\nNow, angle ABC can be found by noting that angle ABC is the same as angle CAD because they are both angles in the same triangle ABC and subtended by the same arc AC. Therefore:\n\\[ \\text{angle ABC} = \\text{angle CAD} = 30^\\circ \\]\n\nThus, the degree of angle ABC is 30.\n\n\n30\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_30
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 45
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.258
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.039
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.011
[36m(Runner pid=3309020)[0m ppo_kl: 2.2494884075497517e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.018
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.018
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.654
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.654
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 978137
[36m(Runner pid=3309020)[0m balanced_min: 978136
[36m(Runner pid=3309020)[0m max: 979244
[36m(Runner pid=3309020)[0m mean: 978136.5
[36m(Runner pid=3309020)[0m min: 977029
[36m(Runner pid=3309020)[0m minmax_diff: 2215
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.643
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(Runner pid=3309020)[0m throughput: 930.021
[36m(Runner pid=3309020)[0m time_per_step: 1051.736
[36m(Runner pid=3309020)[0m total_num_tokens: 1956273
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 693.0
[36m(Runner pid=3309020)[0m mean: 466.252
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1130.0
[36m(Runner pid=3309020)[0m mean: 297.917
[36m(Runner pid=3309020)[0m min: 61.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.311
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.654
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.911319481918035e-05
[36m(Runner pid=3309020)[0m gen: 0.134
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.288
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.174
[36m(Runner pid=3309020)[0m gen: 101.826
[36m(Runner pid=3309020)[0m old: 87.527
[36m(Runner pid=3309020)[0m ref: 88.159
[36m(Runner pid=3309020)[0m reward: 6.469
[36m(Runner pid=3309020)[0m save_checkpoint: 30.906
[36m(Runner pid=3309020)[0m step: 1051.736
[36m(Runner pid=3309020)[0m update_actor: 562.491
[36m(Runner pid=3309020)[0m validation: 173.441
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.383
[36m(Runner pid=3309020)[0m format_reward: 0.98
[36m(Runner pid=3309020)[0m overall_reward: 0.683
[36m(Runner pid=3309020)[0m reward_score: 0.683
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.985
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:16<1:10:09, 3.30s/it, est. speed input: 139.94 toks/s, output: 24.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<52:52, 2.50s/it, est. speed input: 173.78 toks/s, output: 38.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<32:53, 1.56s/it, est. speed input: 243.24 toks/s, output: 59.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<20:43, 1.01it/s, est. speed input: 314.51 toks/s, output: 78.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<10:54, 1.91it/s, est. speed input: 456.04 toks/s, output: 121.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<10:46, 1.93it/s, est. speed input: 489.53 toks/s, output: 131.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<08:14, 2.51it/s, est. speed input: 551.55 toks/s, output: 152.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<05:01, 4.07it/s, est. speed input: 679.41 toks/s, output: 194.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:33<03:10, 6.41it/s, est. speed input: 808.05 toks/s, output: 236.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:33<02:07, 9.46it/s, est. speed input: 939.32 toks/s, output: 280.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:34<01:33, 12.85it/s, est. speed input: 1068.66 toks/s, output: 321.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:34<01:25, 13.98it/s, est. speed input: 1128.14 toks/s, output: 346.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:34<01:36, 12.32it/s, est. speed input: 1173.93 toks/s, output: 363.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:35<01:22, 14.25it/s, est. speed input: 1284.64 toks/s, output: 401.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<01:12, 16.28it/s, est. speed input: 1339.91 toks/s, output: 422.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<01:23, 14.03it/s, est. speed input: 1385.76 toks/s, output: 438.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<01:22, 14.07it/s, est. speed input: 1434.67 toks/s, output: 458.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<01:26, 13.41it/s, est. speed input: 1480.97 toks/s, output: 476.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<01:11, 16.18it/s, est. speed input: 1538.37 toks/s, output: 495.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:50, 22.80it/s, est. speed input: 1656.48 toks/s, output: 535.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:37<00:36, 31.03it/s, est. speed input: 1778.42 toks/s, output: 576.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:37<00:35, 32.20it/s, est. speed input: 1833.71 toks/s, output: 600.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:51, 21.83it/s, est. speed input: 1872.93 toks/s, output: 617.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:53, 21.02it/s, est. speed input: 1921.26 toks/s, output: 635.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:45, 24.15it/s, est. speed input: 2023.02 toks/s, output: 674.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:41, 26.59it/s, est. speed input: 2076.66 toks/s, output: 694.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:31, 34.80it/s, est. speed input: 2237.93 toks/s, output: 774.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:18, 56.66it/s, est. speed input: 2465.99 toks/s, output: 870.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:18, 56.74it/s, est. speed input: 2568.76 toks/s, output: 920.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:18, 57.12it/s, est. speed input: 2672.01 toks/s, output: 968.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:18, 57.40it/s, est. speed input: 2778.38 toks/s, output: 1015.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:39<00:17, 57.82it/s, est. speed input: 2878.49 toks/s, output: 1057.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:40<00:20, 48.98it/s, est. speed input: 2972.21 toks/s, output: 1107.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:40<00:21, 47.81it/s, est. speed input: 3073.52 toks/s, output: 1153.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:40<00:19, 51.08it/s, est. speed input: 3177.86 toks/s, output: 1199.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:40<00:17, 57.53it/s, est. speed input: 3330.42 toks/s, output: 1276.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:40<00:15, 62.02it/s, est. speed input: 3486.40 toks/s, output: 1354.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:41<00:17, 54.05it/s, est. speed input: 3579.78 toks/s, output: 1405.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:41<00:15, 61.46it/s, est. speed input: 3679.30 toks/s, output: 1454.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:41<00:17, 53.43it/s, est. speed input: 3761.14 toks/s, output: 1501.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:41<00:15, 61.39it/s, est. speed input: 3859.25 toks/s, output: 1543.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:41<00:18, 49.55it/s, est. speed input: 3943.26 toks/s, output: 1577.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:15, 57.18it/s, est. speed input: 4094.33 toks/s, output: 1655.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:42<00:16, 55.56it/s, est. speed input: 4179.52 toks/s, output: 1698.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:42<00:15, 58.70it/s, est. speed input: 4271.21 toks/s, output: 1756.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:42<00:14, 60.95it/s, est. speed input: 4408.85 toks/s, output: 1821.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:42<00:07, 114.75it/s, est. speed input: 4836.52 toks/s, output: 2021.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:42<00:07, 115.32it/s, est. speed input: 4980.39 toks/s, output: 2104.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:43<00:08, 92.66it/s, est. speed input: 5115.18 toks/s, output: 2181.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:43<00:09, 80.95it/s, est. speed input: 5248.80 toks/s, output: 2255.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:43<00:09, 81.04it/s, est. speed input: 5333.12 toks/s, output: 2299.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:43<00:09, 78.87it/s, est. speed input: 5466.79 toks/s, output: 2354.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:44<00:11, 66.08it/s, est. speed input: 5539.87 toks/s, output: 2378.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:44<00:12, 57.96it/s, est. speed input: 5615.25 toks/s, output: 2427.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:44<00:12, 60.17it/s, est. speed input: 5740.72 toks/s, output: 2494.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:44<00:11, 61.28it/s, est. speed input: 5828.21 toks/s, output: 2549.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:44<00:11, 62.88it/s, est. speed input: 5914.76 toks/s, output: 2590.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:44<00:11, 61.33it/s, est. speed input: 5990.63 toks/s, output: 2640.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:45<00:10, 68.30it/s, est. speed input: 6077.73 toks/s, output: 2701.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:45<00:09, 74.89it/s, est. speed input: 6170.70 toks/s, output: 2750.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:45<00:08, 80.27it/s, est. speed input: 6304.42 toks/s, output: 2832.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:45<00:06, 94.03it/s, est. speed input: 6490.95 toks/s, output: 2946.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:45<00:06, 96.31it/s, est. speed input: 6706.78 toks/s, output: 3099.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:45<00:06, 93.33it/s, est. speed input: 6789.74 toks/s, output: 3142.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:46<00:09, 62.03it/s, est. speed input: 6839.89 toks/s, output: 3179.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:46<00:09, 58.70it/s, est. speed input: 6915.07 toks/s, output: 3222.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:46<00:09, 58.86it/s, est. speed input: 6985.24 toks/s, output: 3276.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:46<00:07, 74.25it/s, est. speed input: 7158.67 toks/s, output: 3401.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:46<00:07, 75.93it/s, est. speed input: 7239.56 toks/s, output: 3458.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:47<00:05, 87.23it/s, est. speed input: 7453.88 toks/s, output: 3594.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:47<00:05, 95.48it/s, est. speed input: 7586.71 toks/s, output: 3667.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:47<00:05, 91.69it/s, est. speed input: 7713.24 toks/s, output: 3765.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:47<00:05, 88.16it/s, est. speed input: 7788.23 toks/s, output: 3833.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:47<00:05, 78.88it/s, est. speed input: 7855.64 toks/s, output: 3883.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:47<00:05, 79.65it/s, est. speed input: 7934.28 toks/s, output: 3936.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:47<00:04, 91.09it/s, est. speed input: 8060.58 toks/s, output: 3996.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:48<00:04, 96.18it/s, est. speed input: 8182.49 toks/s, output: 4082.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:48<00:03, 126.55it/s, est. speed input: 8401.01 toks/s, output: 4235.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:48<00:02, 131.82it/s, est. speed input: 8575.12 toks/s, output: 4358.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:48<00:03, 119.75it/s, est. speed input: 8693.58 toks/s, output: 4442.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:48<00:03, 97.79it/s, est. speed input: 8795.63 toks/s, output: 4534.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:48<00:03, 98.50it/s, est. speed input: 8917.02 toks/s, output: 4636.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:49<00:04, 79.92it/s, est. speed input: 9042.10 toks/s, output: 4769.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:49<00:03, 84.04it/s, est. speed input: 9160.43 toks/s, output: 4871.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:49<00:03, 80.34it/s, est. speed input: 9220.30 toks/s, output: 4923.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:49<00:03, 78.81it/s, est. speed input: 9325.96 toks/s, output: 5015.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:49<00:03, 88.27it/s, est. speed input: 9435.08 toks/s, output: 5118.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:49<00:02, 89.11it/s, est. speed input: 9508.35 toks/s, output: 5187.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:03, 66.48it/s, est. speed input: 9549.62 toks/s, output: 5228.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:50<00:02, 88.14it/s, est. speed input: 9707.60 toks/s, output: 5366.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:50<00:03, 64.24it/s, est. speed input: 9766.80 toks/s, output: 5448.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:50<00:03, 58.13it/s, est. speed input: 9816.45 toks/s, output: 5482.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:51<00:02, 79.06it/s, est. speed input: 9992.02 toks/s, output: 5642.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:51<00:01, 87.15it/s, est. speed input: 10116.33 toks/s, output: 5750.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 91.03it/s, est. speed input: 10221.08 toks/s, output: 5876.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:02, 59.59it/s, est. speed input: 10258.18 toks/s, output: 5945.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:52<00:02, 50.29it/s, est. speed input: 10288.26 toks/s, output: 6005.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:52<00:02, 55.70it/s, est. speed input: 10352.16 toks/s, output: 6069.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:01, 68.78it/s, est. speed input: 10461.95 toks/s, output: 6174.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:52<00:01, 65.00it/s, est. speed input: 10548.21 toks/s, output: 6304.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:52<00:01, 57.33it/s, est. speed input: 10588.92 toks/s, output: 6381.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:53<00:01, 43.35it/s, est. speed input: 10598.61 toks/s, output: 6433.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:53<00:01, 47.92it/s, est. speed input: 10658.79 toks/s, output: 6504.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:53<00:00, 51.08it/s, est. speed input: 10715.94 toks/s, output: 6615.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:53<00:00, 42.13it/s, est. speed input: 10738.60 toks/s, output: 6693.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:54<00:00, 49.54it/s, est. speed input: 10804.41 toks/s, output: 6791.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 23.78it/s, est. speed input: 10705.81 toks/s, output: 6786.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 25.46it/s, est. speed input: 10724.13 toks/s, output: 6819.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 15.17it/s, est. speed input: 10599.08 toks/s, output: 6777.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:10<00:00, 1.48it/s, est. speed input: 8486.08 toks/s, output: 5457.93 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:10<00:00, 18.24it/s, est. speed input: 8486.08 toks/s, output: 5457.93 toks/s]
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m Training Episode 3.
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 46; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:44:10 [executor_base.py:219] It took 0.343386 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:45:35 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:44:10 [executor_base.py:219] It took 0.343041 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:45:35 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:45:35 [executor_base.py:208] It took 0.328719 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:45:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:45:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:45:58 [executor_base.py:208] It took 0.326335 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0003800225676968694, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.000291835778625682, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009091436513699591}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.07318838685750961, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00016909422993194312}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4187702536582947, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.6023392677307129, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004335195990279317}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.10838743299245834, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00025586874107830226, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005681905895471573}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.37175536155700684, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2653006613254547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0003288913576398045, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3012937605381012, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012872838415205479}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.12417767196893692, 'actor/pg_clipfrac': 0.0028943559154868126, 'actor/ppo_kl': 0.0004572778707370162}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00020264953491277993, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005525285378098488}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2579335868358612, 'actor/pg_clipfrac': 0.00162601622287184, 'actor/ppo_kl': -0.002023541834205389}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.28522372245788574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4495236873626709, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.05711271986365318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011092672357335687}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.12789669632911682, 'actor/pg_clipfrac': 0.0015313936164602637, 'actor/ppo_kl': -0.001157407183200121}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00028642817051149905, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009556520963087678}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.40652400255203247, 'actor/pg_clipfrac': 0.0007052186410874128, 'actor/ppo_kl': -0.00011537576210685074}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0004340567975305021, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003790906921494752}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00022174841433297843, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.003856954863294959}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1823669970035553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018245637184008956}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00012965104542672634, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018478435231372714}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.045131783932447433, 'actor/pg_clipfrac': 0.001775147975422442, 'actor/ppo_kl': 0.00010387827205704525}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.15457184612751007, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00013097928604111075}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00017956468218471855, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006497952854260802}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00025167976855300367, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000683697231579572}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00036268861731514335, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005342436488717794}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.17852188646793365, 'actor/pg_clipfrac': 0.0006671114242635667, 'actor/ppo_kl': -0.000497366301715374}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.05865031108260155, 'actor/pg_clipfrac': 0.001768346643075347, 'actor/ppo_kl': 0.00024321646196767688}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.05876386910676956, 'actor/pg_clipfrac': 0.0007886435487307608, 'actor/ppo_kl': -0.00019934200099669397}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002377552882535383, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009317180374637246}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00018426537280902267, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008147152839228511}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.11803759634494781, 'actor/pg_clipfrac': 0.0021857924293726683, 'actor/ppo_kl': -0.000787576544098556}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.5012291073799133, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004891709540970623}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002571093791630119, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010483803926035762}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00013115750334691256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00013401084288489074}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.017834411934018135, 'actor/pg_clipfrac': 0.002926115645095706, 'actor/ppo_kl': 0.00044727674685418606}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4043445885181427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011156653054058552}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.10715057700872421, 'actor/pg_clipfrac': 0.0005376344197429717, 'actor/ppo_kl': -0.0010328969219699502}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2842412292957306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001395922590745613}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00030731450533494353, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00017056115029845387}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.36675918102264404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00043613548041321337}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0006482438766397536, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002062160288915038}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.20352095365524292, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008013327606022358}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6834880709648132, 'actor/pg_clipfrac': 0.004597701132297516, 'actor/ppo_kl': 1.1856254786835052e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5139389038085938, 'actor/pg_clipfrac': 0.0009250693838112056, 'actor/ppo_kl': 0.00012746770516969264}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00026352234999649227, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009140546317212284}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.13030336797237396, 'actor/pg_clipfrac': 0.003948667552322149, 'actor/ppo_kl': 0.00016334849351551384}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2606174051761627, 'actor/pg_clipfrac': 0.0007645260193385184, 'actor/ppo_kl': -0.0014129705959931016}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.24639183282852173, 'actor/pg_clipfrac': 0.00358422938734293, 'actor/ppo_kl': 0.0010614377679303288}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00033731391886249185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011170379584655166}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6951855421066284, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007826796500012279}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.13045398890972137, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041891937144100666}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.031736262142658234, 'actor/pg_clipfrac': 0.0025348542258143425, 'actor/ppo_kl': 0.0012193155707791448}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.5161186456680298, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007393396808765829}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00026736268773674965, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00026978261303156614}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003017251146957278, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011580507270991802}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0036662272177636623, 'actor/pg_clipfrac': 0.0032552082557231188, 'actor/ppo_kl': -0.00017256166029255837}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.42936986684799194, 'actor/pg_clipfrac': 0.0024232633877545595, 'actor/ppo_kl': 6.665089313173667e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2864355742931366, 'actor/pg_clipfrac': 0.003530450165271759, 'actor/ppo_kl': 0.0011084523284807801}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.11644559353590012, 'actor/pg_clipfrac': 0.0016713092336431146, 'actor/ppo_kl': 9.351108747068793e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002925819717347622, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00011086717859143391}
[36m(Runner pid=3309020)[0m Step 46
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.241
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.037
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.011
[36m(Runner pid=3309020)[0m ppo_kl: 9.967131319533196e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.013
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.669
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.669
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:23<1:41:07, 4.76s/it, est. speed input: 99.82 toks/s, output: 24.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<53:03, 2.51s/it, est. speed input: 162.97 toks/s, output: 44.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:31<34:05, 1.62s/it, est. speed input: 218.96 toks/s, output: 65.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<21:36, 1.03s/it, est. speed input: 283.32 toks/s, output: 87.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:32<10:47, 1.93it/s, est. speed input: 412.79 toks/s, output: 133.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<09:20, 2.22it/s, est. speed input: 466.02 toks/s, output: 153.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:34<07:45, 2.67it/s, est. speed input: 515.56 toks/s, output: 177.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<05:46, 3.57it/s, est. speed input: 576.53 toks/s, output: 198.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<04:38, 4.42it/s, est. speed input: 632.05 toks/s, output: 214.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<04:18, 4.73it/s, est. speed input: 680.89 toks/s, output: 233.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<02:32, 7.96it/s, est. speed input: 803.41 toks/s, output: 275.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:38<02:58, 6.74it/s, est. speed input: 885.38 toks/s, output: 307.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:38<01:59, 9.96it/s, est. speed input: 1001.53 toks/s, output: 352.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:38<01:49, 10.86it/s, est. speed input: 1053.88 toks/s, output: 372.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:39<01:18, 15.13it/s, est. speed input: 1166.91 toks/s, output: 409.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:39<01:32, 12.65it/s, est. speed input: 1207.10 toks/s, output: 426.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:39<01:22, 14.19it/s, est. speed input: 1260.23 toks/s, output: 451.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:40<01:29, 13.05it/s, est. speed input: 1304.60 toks/s, output: 472.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:41<02:01, 9.52it/s, est. speed input: 1327.32 toks/s, output: 489.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:41<01:01, 18.56it/s, est. speed input: 1497.10 toks/s, output: 567.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:41<00:54, 20.88it/s, est. speed input: 1548.67 toks/s, output: 591.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:41<00:48, 23.50it/s, est. speed input: 1602.59 toks/s, output: 612.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:41<00:26, 42.12it/s, est. speed input: 1823.86 toks/s, output: 710.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:42<00:26, 42.20it/s, est. speed input: 1920.91 toks/s, output: 753.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:42<00:27, 39.65it/s, est. speed input: 2018.70 toks/s, output: 801.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:42<00:33, 31.95it/s, est. speed input: 2106.64 toks/s, output: 854.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:43<00:24, 42.60it/s, est. speed input: 2308.18 toks/s, output: 956.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:44<00:39, 26.62it/s, est. speed input: 2405.20 toks/s, output: 1006.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:44<00:36, 28.39it/s, est. speed input: 2447.47 toks/s, output: 1026.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:44<00:25, 40.29it/s, est. speed input: 2599.23 toks/s, output: 1107.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:44<00:22, 44.76it/s, est. speed input: 2695.44 toks/s, output: 1155.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:44<00:16, 59.64it/s, est. speed input: 2836.02 toks/s, output: 1235.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:44<00:14, 66.43it/s, est. speed input: 2936.44 toks/s, output: 1293.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:44<00:16, 61.07it/s, est. speed input: 3026.30 toks/s, output: 1342.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:45<00:10, 89.67it/s, est. speed input: 3317.15 toks/s, output: 1487.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:45<00:15, 60.06it/s, est. speed input: 3435.99 toks/s, output: 1555.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:45<00:12, 71.74it/s, est. speed input: 3621.77 toks/s, output: 1663.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:45<00:10, 87.14it/s, est. speed input: 3823.58 toks/s, output: 1774.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:46<00:08, 101.21it/s, est. speed input: 4014.86 toks/s, output: 1886.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:46<00:10, 85.85it/s, est. speed input: 4145.46 toks/s, output: 1963.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:07, 109.62it/s, est. speed input: 4389.00 toks/s, output: 2090.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:46<00:07, 112.58it/s, est. speed input: 4527.94 toks/s, output: 2171.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:47<00:12, 63.85it/s, est. speed input: 4626.13 toks/s, output: 2238.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:47<00:09, 80.46it/s, est. speed input: 4815.29 toks/s, output: 2354.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:47<00:10, 75.67it/s, est. speed input: 4941.04 toks/s, output: 2425.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:47<00:09, 77.38it/s, est. speed input: 5071.78 toks/s, output: 2515.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:47<00:07, 97.22it/s, est. speed input: 5253.88 toks/s, output: 2618.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:48<00:11, 62.49it/s, est. speed input: 5343.11 toks/s, output: 2680.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:48<00:11, 64.91it/s, est. speed input: 5419.76 toks/s, output: 2729.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:48<00:09, 75.15it/s, est. speed input: 5587.15 toks/s, output: 2825.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:48<00:09, 73.13it/s, est. speed input: 5705.03 toks/s, output: 2914.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:48<00:08, 74.81it/s, est. speed input: 5783.83 toks/s, output: 2959.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:49<00:10, 60.36it/s, est. speed input: 5852.49 toks/s, output: 2995.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:49<00:08, 77.59it/s, est. speed input: 6032.11 toks/s, output: 3113.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:49<00:09, 63.38it/s, est. speed input: 6132.37 toks/s, output: 3161.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:49<00:09, 64.15it/s, est. speed input: 6209.58 toks/s, output: 3212.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:49<00:07, 80.52it/s, est. speed input: 6380.48 toks/s, output: 3333.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:49<00:07, 80.93it/s, est. speed input: 6462.80 toks/s, output: 3380.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:50<00:05, 105.76it/s, est. speed input: 6683.53 toks/s, output: 3520.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:50<00:04, 111.42it/s, est. speed input: 6811.45 toks/s, output: 3606.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:50<00:06, 86.54it/s, est. speed input: 6907.17 toks/s, output: 3688.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:50<00:07, 71.04it/s, est. speed input: 6997.93 toks/s, output: 3762.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:50<00:06, 83.06it/s, est. speed input: 7116.31 toks/s, output: 3867.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:51<00:05, 90.61it/s, est. speed input: 7236.25 toks/s, output: 3933.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:51<00:04, 107.18it/s, est. speed input: 7394.32 toks/s, output: 4062.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:51<00:03, 116.95it/s, est. speed input: 7567.71 toks/s, output: 4192.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:51<00:04, 95.20it/s, est. speed input: 7681.06 toks/s, output: 4279.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:51<00:05, 75.15it/s, est. speed input: 7770.16 toks/s, output: 4352.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:52<00:05, 75.47it/s, est. speed input: 7846.61 toks/s, output: 4415.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:52<00:03, 110.32it/s, est. speed input: 8099.13 toks/s, output: 4634.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:52<00:03, 115.90it/s, est. speed input: 8214.81 toks/s, output: 4745.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:52<00:02, 128.25it/s, est. speed input: 8416.62 toks/s, output: 4921.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:52<00:01, 160.68it/s, est. speed input: 8663.95 toks/s, output: 5156.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:52<00:02, 127.23it/s, est. speed input: 8816.25 toks/s, output: 5255.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:52<00:02, 129.75it/s, est. speed input: 8931.60 toks/s, output: 5371.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:53<00:02, 113.15it/s, est. speed input: 9026.77 toks/s, output: 5480.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:53<00:02, 103.59it/s, est. speed input: 9127.16 toks/s, output: 5578.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:53<00:02, 107.66it/s, est. speed input: 9230.28 toks/s, output: 5700.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:53<00:02, 102.06it/s, est. speed input: 9330.87 toks/s, output: 5807.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:53<00:01, 130.64it/s, est. speed input: 9527.53 toks/s, output: 6023.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:53<00:01, 116.84it/s, est. speed input: 9656.76 toks/s, output: 6124.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:54<00:01, 99.51it/s, est. speed input: 9743.03 toks/s, output: 6234.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:54<00:01, 111.52it/s, est. speed input: 9927.01 toks/s, output: 6453.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:54<00:01, 78.96it/s, est. speed input: 9995.88 toks/s, output: 6531.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:54<00:01, 78.91it/s, est. speed input: 10072.59 toks/s, output: 6595.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:01, 73.60it/s, est. speed input: 10128.75 toks/s, output: 6656.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:55<00:01, 62.55it/s, est. speed input: 10165.56 toks/s, output: 6727.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:55<00:01, 62.15it/s, est. speed input: 10219.86 toks/s, output: 6779.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:55<00:01, 39.92it/s, est. speed input: 10214.04 toks/s, output: 6802.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:55<00:01, 45.99it/s, est. speed input: 10268.10 toks/s, output: 6838.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:56<00:00, 49.59it/s, est. speed input: 10317.82 toks/s, output: 6916.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:56<00:00, 42.20it/s, est. speed input: 10344.06 toks/s, output: 6952.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:56<00:00, 47.15it/s, est. speed input: 10402.21 toks/s, output: 7049.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:56<00:00, 39.15it/s, est. speed input: 10415.31 toks/s, output: 7116.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:58<00:00, 11.00it/s, est. speed input: 10098.77 toks/s, output: 6910.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:00<00:00, 8.15it/s, est. speed input: 9927.12 toks/s, output: 6836.75 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:00<00:00, 21.26it/s, est. speed input: 9927.12 toks/s, output: 6836.75 toks/s]
[36m(Runner pid=3309020)[0m balanced_max: 971983
[36m(Runner pid=3309020)[0m balanced_min: 970670
[36m(Runner pid=3309020)[0m max: 978553
[36m(Runner pid=3309020)[0m mean: 971326.5
[36m(Runner pid=3309020)[0m min: 964100
[36m(Runner pid=3309020)[0m minmax_diff: 14453
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.508
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(Runner pid=3309020)[0m throughput: 1122.215
[36m(Runner pid=3309020)[0m time_per_step: 865.544
[36m(Runner pid=3309020)[0m total_num_tokens: 1942653
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 612.0
[36m(Runner pid=3309020)[0m mean: 465.432
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3151.0
[36m(Runner pid=3309020)[0m mean: 293.417
[36m(Runner pid=3309020)[0m min: 43.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.338
[36m(Runner pid=3309020)[0m format: 1.0
[36m(Runner pid=3309020)[0m overall: 0.669
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.166
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.202
[36m(Runner pid=3309020)[0m gen: 124.66
[36m(Runner pid=3309020)[0m old: 85.86
[36m(Runner pid=3309020)[0m ref: 86.884
[36m(Runner pid=3309020)[0m reward: 5.929
[36m(Runner pid=3309020)[0m step: 865.544
[36m(Runner pid=3309020)[0m update_actor: 561.279
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 47; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 08:58:39 [executor_base.py:219] It took 0.340083 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:00:09 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 08:58:39 [executor_base.py:219] It took 0.340185 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:00:09 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:00:09 [executor_base.py:208] It took 0.327656 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:00:10 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:00:10 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:00:10 [executor_base.py:208] It took 0.325633 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5290394425392151, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.7031172513961792, 'actor/pg_clipfrac': 0.002730748150497675, 'actor/ppo_kl': -0.0008228099904954433}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.061657704412937164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.34800082445144653, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00029069246375001967, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017907186411321163}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.031403813511133194, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00019444695499259979, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3678443431854248, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.1398821920156479, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00020316473091952503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004994589253328741}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00026516689104028046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.13969171047210693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.1273421049118042, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004895186284556985}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.5235476493835449, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00016366034105885774, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00025317390100099146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002249599201604724}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.06973161548376083, 'actor/pg_clipfrac': 0.002421307610347867, 'actor/ppo_kl': 0.0008084872388280928}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2643362581729889, 'actor/pg_clipfrac': 0.002385211642831564, 'actor/ppo_kl': 0.0007709669298492372}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3024095296859741, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005716967862099409}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0003842898004222661, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011093448847532272}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.2397586852312088, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.91944907582365e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.12819267809391022, 'actor/pg_clipfrac': 0.0014720313483849168, 'actor/ppo_kl': -0.00048330696881748736}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0005051856278441846, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007655561785213649}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2748618423938751, 'actor/pg_clipfrac': 0.0033149172086268663, 'actor/ppo_kl': 0.00017385746468789876}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3090002238750458, 'actor/pg_clipfrac': 0.00048543690354563296, 'actor/ppo_kl': 0.0015203235670924187}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.611401379108429, 'actor/pg_clipfrac': 0.0010471204295754433, 'actor/ppo_kl': -0.0004970310837961733}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.19022418558597565, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.800438884762116e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00024235238379333168, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002976130635943264}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.476209819316864, 'actor/pg_clipfrac': 0.0012531328247860074, 'actor/ppo_kl': 0.001786081469617784}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.4860425293445587, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001326577621512115}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00044852090650238097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007560185040347278}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00021011382341384888, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005127833574078977}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.36697128415107727, 'actor/pg_clipfrac': 0.0009881423320621252, 'actor/ppo_kl': 0.0006270993035286665}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.28442147374153137, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020387929398566484}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.17236268520355225, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006826891913078725}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.36585870385169983, 'actor/pg_clipfrac': 0.003841229248791933, 'actor/ppo_kl': -0.0012140237959101796}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00030805953429080546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00046425446635112166}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.02550116553902626, 'actor/pg_clipfrac': 0.002358490601181984, 'actor/ppo_kl': -0.00011079041723860428}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4548836946487427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016137372003868222}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.4476025104522705, 'actor/pg_clipfrac': 0.0009737098589539528, 'actor/ppo_kl': 0.0021082498133182526}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00017229812510777265, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00103098398540169}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00034816141123883426, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0024197762832045555}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.27146202325820923, 'actor/pg_clipfrac': 0.0006443298771046102, 'actor/ppo_kl': 0.0009910335065796971}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.18276576697826385, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006359926192089915}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0001835862931329757, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00016939039051067084}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3147898018360138, 'actor/pg_clipfrac': 0.002016128972172737, 'actor/ppo_kl': -0.0001563660625834018}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0004802571202162653, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011095167137682438}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00024272403970826417, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000635050586424768}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.23314917087554932, 'actor/pg_clipfrac': 0.00479233218356967, 'actor/ppo_kl': -0.0002259781613247469}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.15824154019355774, 'actor/pg_clipfrac': 0.0011061946861445904, 'actor/ppo_kl': 0.0014940362889319658}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.48370814323425293, 'actor/pg_clipfrac': 0.0021786491852253675, 'actor/ppo_kl': -0.003629150567576289}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.26240867376327515, 'actor/pg_clipfrac': 0.0009727626456879079, 'actor/ppo_kl': 0.0010739745339378715}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.24865376949310303, 'actor/pg_clipfrac': 0.0014662756584584713, 'actor/ppo_kl': 0.00188602099660784}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00023036685888655484, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00012851934297941625}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.09880391508340836, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011444505071267486}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.19074629247188568, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005762454820796847}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.5708229541778564, 'actor/pg_clipfrac': 0.0012903226306661963, 'actor/ppo_kl': -0.0024982329923659563}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.30612877011299133, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016309674829244614}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2542456090450287, 'actor/pg_clipfrac': 0.0011587485205382109, 'actor/ppo_kl': 0.0006638324703089893}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00016994560428429395, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004693133232649416}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5210877060890198, 'actor/pg_clipfrac': 0.0012210012646391988, 'actor/ppo_kl': 0.0010536040645092726}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.421016663312912, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002714098081924021}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.12915384769439697, 'actor/pg_clipfrac': 0.0009372071363031864, 'actor/ppo_kl': -0.0008833794854581356}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003189165727235377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00015929281653370708}
[36m(Runner pid=3309020)[0m Step 47
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.231
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.036
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:35:42, 4.50s/it, est. speed input: 106.13 toks/s, output: 23.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:22<39:47, 1.88s/it, est. speed input: 206.97 toks/s, output: 48.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:27<30:41, 1.46s/it, est. speed input: 252.43 toks/s, output: 62.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<24:39, 1.17s/it, est. speed input: 297.54 toks/s, output: 77.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<16:22, 1.28it/s, est. speed input: 365.88 toks/s, output: 99.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<13:52, 1.50it/s, est. speed input: 409.63 toks/s, output: 115.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:34<10:03, 2.06it/s, est. speed input: 471.54 toks/s, output: 135.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:35<07:47, 2.65it/s, est. speed input: 524.48 toks/s, output: 149.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<03:29, 5.84it/s, est. speed input: 719.67 toks/s, output: 215.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:20, 8.62it/s, est. speed input: 847.22 toks/s, output: 259.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:10, 9.27it/s, est. speed input: 899.45 toks/s, output: 276.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<01:57, 10.30it/s, est. speed input: 955.41 toks/s, output: 293.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<02:15, 8.84it/s, est. speed input: 998.59 toks/s, output: 313.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:49, 10.93it/s, est. speed input: 1053.39 toks/s, output: 336.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:28, 13.40it/s, est. speed input: 1114.75 toks/s, output: 363.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<00:57, 20.64it/s, est. speed input: 1236.16 toks/s, output: 409.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<00:55, 21.25it/s, est. speed input: 1289.08 toks/s, output: 429.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:49, 23.65it/s, est. speed input: 1343.96 toks/s, output: 456.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:38<00:44, 25.98it/s, est. speed input: 1402.27 toks/s, output: 480.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:41, 28.14it/s, est. speed input: 1457.59 toks/s, output: 504.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:32, 34.94it/s, est. speed input: 1623.02 toks/s, output: 576.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:35, 32.25it/s, est. speed input: 1669.22 toks/s, output: 599.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:40, 27.72it/s, est. speed input: 1717.97 toks/s, output: 619.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:39<00:59, 18.99it/s, est. speed input: 1754.89 toks/s, output: 641.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:32, 34.50it/s, est. speed input: 1975.11 toks/s, output: 733.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:30, 36.41it/s, est. speed input: 2079.96 toks/s, output: 771.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:23, 46.85it/s, est. speed input: 2248.57 toks/s, output: 837.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:20, 53.43it/s, est. speed input: 2361.24 toks/s, output: 895.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:34, 30.67it/s, est. speed input: 2435.79 toks/s, output: 934.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:36, 28.86it/s, est. speed input: 2523.85 toks/s, output: 973.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:29, 35.83it/s, est. speed input: 2629.89 toks/s, output: 1034.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:28, 35.88it/s, est. speed input: 2725.85 toks/s, output: 1079.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:42<00:32, 31.32it/s, est. speed input: 2809.37 toks/s, output: 1114.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:42<00:25, 39.17it/s, est. speed input: 2912.70 toks/s, output: 1167.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:27, 36.83it/s, est. speed input: 2996.06 toks/s, output: 1213.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:42<00:15, 61.33it/s, est. speed input: 3263.08 toks/s, output: 1344.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:14, 67.19it/s, est. speed input: 3360.66 toks/s, output: 1401.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:13, 73.05it/s, est. speed input: 3462.58 toks/s, output: 1461.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:10, 90.31it/s, est. speed input: 3659.70 toks/s, output: 1561.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:43<00:11, 79.21it/s, est. speed input: 3809.92 toks/s, output: 1639.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:43<00:12, 70.82it/s, est. speed input: 3896.07 toks/s, output: 1686.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:43<00:13, 66.41it/s, est. speed input: 3982.62 toks/s, output: 1734.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:11, 77.78it/s, est. speed input: 4126.43 toks/s, output: 1822.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:11, 77.76it/s, est. speed input: 4222.13 toks/s, output: 1867.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:44<00:11, 77.57it/s, est. speed input: 4312.96 toks/s, output: 1910.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:11, 77.99it/s, est. speed input: 4404.29 toks/s, output: 1964.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:14, 57.28it/s, est. speed input: 4479.47 toks/s, output: 2005.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:44<00:10, 80.14it/s, est. speed input: 4686.44 toks/s, output: 2121.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:44<00:08, 99.95it/s, est. speed input: 4874.87 toks/s, output: 2232.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:11, 70.91it/s, est. speed input: 4994.16 toks/s, output: 2306.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:45<00:10, 76.50it/s, est. speed input: 5131.63 toks/s, output: 2400.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:45<00:11, 64.92it/s, est. speed input: 5207.48 toks/s, output: 2447.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:45<00:10, 69.94it/s, est. speed input: 5293.00 toks/s, output: 2483.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:45<00:11, 65.44it/s, est. speed input: 5373.26 toks/s, output: 2537.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:11, 67.11it/s, est. speed input: 5459.64 toks/s, output: 2587.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:09, 73.36it/s, est. speed input: 5547.84 toks/s, output: 2637.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:46<00:09, 73.83it/s, est. speed input: 5626.95 toks/s, output: 2680.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:07, 96.28it/s, est. speed input: 5818.77 toks/s, output: 2797.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:07, 94.07it/s, est. speed input: 5995.42 toks/s, output: 2905.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:46<00:05, 110.47it/s, est. speed input: 6178.49 toks/s, output: 3006.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:46<00:05, 124.10it/s, est. speed input: 6358.09 toks/s, output: 3127.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:46<00:04, 147.26it/s, est. speed input: 6626.86 toks/s, output: 3271.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:04, 143.60it/s, est. speed input: 6803.94 toks/s, output: 3349.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:47<00:05, 107.01it/s, est. speed input: 6909.82 toks/s, output: 3422.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:47<00:04, 112.53it/s, est. speed input: 7047.67 toks/s, output: 3506.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:47<00:04, 124.95it/s, est. speed input: 7217.82 toks/s, output: 3616.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:47<00:05, 104.00it/s, est. speed input: 7332.28 toks/s, output: 3679.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:47<00:04, 105.57it/s, est. speed input: 7544.27 toks/s, output: 3814.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:05, 87.25it/s, est. speed input: 7648.37 toks/s, output: 3872.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:48<00:04, 96.68it/s, est. speed input: 7782.58 toks/s, output: 3966.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:06, 71.26it/s, est. speed input: 7873.34 toks/s, output: 4048.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:48<00:06, 73.64it/s, est. speed input: 7953.51 toks/s, output: 4101.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:48<00:03, 110.14it/s, est. speed input: 8220.16 toks/s, output: 4319.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:04, 91.50it/s, est. speed input: 8317.42 toks/s, output: 4410.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:49<00:04, 81.85it/s, est. speed input: 8416.58 toks/s, output: 4488.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:04, 91.21it/s, est. speed input: 8580.89 toks/s, output: 4631.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:49<00:02, 128.18it/s, est. speed input: 8845.65 toks/s, output: 4832.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:49<00:02, 135.85it/s, est. speed input: 9011.71 toks/s, output: 4975.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:50<00:02, 112.03it/s, est. speed input: 9166.29 toks/s, output: 5073.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:50<00:01, 138.27it/s, est. speed input: 9427.53 toks/s, output: 5336.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:01, 142.06it/s, est. speed input: 9590.96 toks/s, output: 5473.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:50<00:01, 146.17it/s, est. speed input: 9752.77 toks/s, output: 5629.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:50<00:02, 88.29it/s, est. speed input: 9847.76 toks/s, output: 5719.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:51<00:02, 92.24it/s, est. speed input: 9996.45 toks/s, output: 5851.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:51<00:01, 90.19it/s, est. speed input: 10100.11 toks/s, output: 5958.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:51<00:01, 86.43it/s, est. speed input: 10189.65 toks/s, output: 6065.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:01, 104.23it/s, est. speed input: 10342.33 toks/s, output: 6212.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:51<00:01, 94.31it/s, est. speed input: 10434.78 toks/s, output: 6299.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:51<00:01, 103.96it/s, est. speed input: 10544.16 toks/s, output: 6422.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:52<00:01, 83.30it/s, est. speed input: 10622.82 toks/s, output: 6507.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:52<00:00, 90.84it/s, est. speed input: 10737.93 toks/s, output: 6610.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:52<00:00, 67.73it/s, est. speed input: 10794.81 toks/s, output: 6703.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:52<00:00, 53.86it/s, est. speed input: 10816.95 toks/s, output: 6736.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:53<00:00, 48.28it/s, est. speed input: 10839.02 toks/s, output: 6811.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 49.48it/s, est. speed input: 10905.12 toks/s, output: 6912.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 38.04it/s, est. speed input: 10902.78 toks/s, output: 6952.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.67it/s, est. speed input: 11018.07 toks/s, output: 7092.54 toks/s]
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: -2.274166946207856e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.651
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.651
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 993832
[36m(Runner pid=3309020)[0m balanced_min: 993831
[36m(Runner pid=3309020)[0m max: 1009510
[36m(Runner pid=3309020)[0m mean: 993831.5
[36m(Runner pid=3309020)[0m min: 978153
[36m(Runner pid=3309020)[0m minmax_diff: 31357
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.477
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1162.285
[36m(Runner pid=3309020)[0m time_per_step: 855.067
[36m(Runner pid=3309020)[0m total_num_tokens: 1987663
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 467.355
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1599.0
[36m(Runner pid=3309020)[0m mean: 309.075
[36m(Runner pid=3309020)[0m min: 48.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.304
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.651
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.279902775934044e-05
[36m(Runner pid=3309020)[0m gen: 0.133
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.284
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.184
[36m(Runner pid=3309020)[0m gen: 105.193
[36m(Runner pid=3309020)[0m old: 88.637
[36m(Runner pid=3309020)[0m ref: 89.288
[36m(Runner pid=3309020)[0m reward: 6.283
[36m(Runner pid=3309020)[0m step: 855.067
[36m(Runner pid=3309020)[0m update_actor: 564.917
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 48; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:12:55 [executor_base.py:219] It took 0.340687 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:14:20 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:12:55 [executor_base.py:219] It took 0.340816 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:14:20 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:14:20 [executor_base.py:208] It took 0.325405 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:14:33 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:14:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:14:34 [executor_base.py:208] It took 0.327715 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.07480689883232117, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.23273561894893646, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.668880045413971, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4508422911167145, 'actor/pg_clipfrac': 0.0009596928721293807, 'actor/ppo_kl': -0.0004779079754371196}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.29853555560112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.29373425245285034, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0003174036683049053, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.09368038177490234, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00019692929345183074, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00016481743659824133}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4752519428730011, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003128934185951948, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005315847811289132}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.000409657193813473, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1244855746626854, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2880445420742035, 'actor/pg_clipfrac': 0.0004752851673401892, 'actor/ppo_kl': 0.00017217357526533306}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3504471778869629, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001267308834940195}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00019308293121866882, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007927782717160881}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.17087224125862122, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005877921939827502}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4092871844768524, 'actor/pg_clipfrac': 0.0021953897085040808, 'actor/ppo_kl': -0.0006323185516521335}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08305101096630096, 'actor/pg_clipfrac': 0.0017256255960091949, 'actor/ppo_kl': 0.0007843448547646403}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5802338123321533, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027901484281755984}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.21248489618301392, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001701759290881455}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.34174787998199463, 'actor/pg_clipfrac': 0.0013586956774815917, 'actor/ppo_kl': 0.0005875825881958008}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.23897041380405426, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00038608303293585777}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.12792715430259705, 'actor/pg_clipfrac': 0.0028818442951887846, 'actor/ppo_kl': -0.00033518948475830257}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.24709884822368622, 'actor/pg_clipfrac': 0.0010010009864345193, 'actor/ppo_kl': -0.00010507219121791422}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.2562344968318939, 'actor/pg_clipfrac': 0.0008764241938479245, 'actor/ppo_kl': 0.0002520893176551908}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.11098109185695648, 'actor/pg_clipfrac': 0.002314814832061529, 'actor/ppo_kl': 0.001950361067429185}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1911388337612152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00038934280746616423}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002740547643043101, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012876663822680712}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.05531277507543564, 'actor/pg_clipfrac': 0.0006253908504731953, 'actor/ppo_kl': 0.0009506791247986257}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.20104636251926422, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00033426526351831853}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.17503070831298828, 'actor/pg_clipfrac': 0.0006393861840479076, 'actor/ppo_kl': 0.0005030034226365387}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.02865590713918209, 'actor/pg_clipfrac': 0.007352941203862429, 'actor/ppo_kl': -9.018065611599013e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3765202462673187, 'actor/pg_clipfrac': 0.0013440860202535987, 'actor/ppo_kl': -0.0006054755067452788}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.11668366938829422, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 3.1067618692759424e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.027822747826576233, 'actor/pg_clipfrac': 0.0006954103009775281, 'actor/ppo_kl': 0.0013862458290532231}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3257010281085968, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.8291231754119508e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00022865126084070653, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041393350693397224}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.005257087294012308, 'actor/pg_clipfrac': 0.0009157509193755686, 'actor/ppo_kl': -4.784091652254574e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.30700206756591797, 'actor/pg_clipfrac': 0.0006738544325344265, 'actor/ppo_kl': -0.000649851921480149}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5609527230262756, 'actor/pg_clipfrac': 0.001853568130172789, 'actor/ppo_kl': -0.0008190604276023805}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002566410112194717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004993520560674369}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.05556483566761017, 'actor/pg_clipfrac': 0.0011166945332661271, 'actor/ppo_kl': -0.0011959384428337216}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.4748503565788269, 'actor/pg_clipfrac': 0.0011806375114247203, 'actor/ppo_kl': 0.0008038337109610438}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.4843127131462097, 'actor/pg_clipfrac': 0.00223380490206182, 'actor/ppo_kl': -0.0011006835848093033}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.06794440746307373, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0019067191751673818}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.40227010846138, 'actor/pg_clipfrac': 0.002691789995878935, 'actor/ppo_kl': -0.0002440122771076858}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.22357851266860962, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005375105538405478}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2247605323791504, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005311407148838043}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00867747887969017, 'actor/pg_clipfrac': 0.0013850415125489235, 'actor/ppo_kl': 0.00115879881195724}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5375062823295593, 'actor/pg_clipfrac': 0.0006891798693686724, 'actor/ppo_kl': 0.0007371330866590142}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.03778383135795593, 'actor/pg_clipfrac': 0.001855287584476173, 'actor/ppo_kl': -0.0013583418913185596}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1750437617301941, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009148359531536698}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002133576781488955, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003173957811668515}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.38657140731811523, 'actor/pg_clipfrac': 0.0020040080416947603, 'actor/ppo_kl': -3.599371848395094e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.13625767827033997, 'actor/pg_clipfrac': 0.0005518763791769743, 'actor/ppo_kl': 0.00044484695536084473}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2308557778596878, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011171934893354774}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.09992627054452896, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00023188242630567402}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.09540260583162308, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002471210202202201}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:15<1:07:34, 3.18s/it, est. speed input: 142.77 toks/s, output: 24.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<49:27, 2.34s/it, est. speed input: 182.08 toks/s, output: 38.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<27:20, 1.30s/it, est. speed input: 277.02 toks/s, output: 63.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<21:06, 1.01s/it, est. speed input: 333.00 toks/s, output: 79.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<11:32, 1.80it/s, est. speed input: 472.12 toks/s, output: 119.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<10:21, 2.00it/s, est. speed input: 519.86 toks/s, output: 137.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<10:09, 2.03it/s, est. speed input: 552.10 toks/s, output: 147.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<07:34, 2.72it/s, est. speed input: 612.87 toks/s, output: 170.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<06:18, 3.25it/s, est. speed input: 666.48 toks/s, output: 189.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:34<03:42, 5.49it/s, est. speed input: 793.03 toks/s, output: 232.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<02:56, 6.89it/s, est. speed input: 859.00 toks/s, output: 251.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<01:58, 10.21it/s, est. speed input: 973.63 toks/s, output: 302.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:10, 16.92it/s, est. speed input: 1159.83 toks/s, output: 363.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:35<01:10, 16.91it/s, est. speed input: 1212.53 toks/s, output: 382.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<00:51, 22.74it/s, est. speed input: 1331.85 toks/s, output: 422.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<00:36, 31.65it/s, est. speed input: 1519.75 toks/s, output: 484.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:36<00:29, 39.27it/s, est. speed input: 1758.01 toks/s, output: 560.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:36<00:33, 33.85it/s, est. speed input: 1863.44 toks/s, output: 600.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:33, 34.06it/s, est. speed input: 1920.64 toks/s, output: 625.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:29, 37.19it/s, est. speed input: 2033.96 toks/s, output: 672.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:37<00:32, 34.41it/s, est. speed input: 2140.08 toks/s, output: 715.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:35, 30.81it/s, est. speed input: 2242.43 toks/s, output: 765.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:39, 27.62it/s, est. speed input: 2289.07 toks/s, output: 782.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:38<00:30, 34.99it/s, est. speed input: 2400.74 toks/s, output: 823.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:38<00:35, 30.11it/s, est. speed input: 2440.60 toks/s, output: 841.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:38<00:27, 38.92it/s, est. speed input: 2551.25 toks/s, output: 891.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:39<00:34, 30.33it/s, est. speed input: 2641.63 toks/s, output: 931.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:39, 26.41it/s, est. speed input: 2682.45 toks/s, output: 947.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:39<00:36, 28.78it/s, est. speed input: 2732.81 toks/s, output: 971.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:33, 31.20it/s, est. speed input: 2779.90 toks/s, output: 999.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:26, 38.42it/s, est. speed input: 2879.13 toks/s, output: 1045.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:28, 35.91it/s, est. speed input: 2925.14 toks/s, output: 1069.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:30, 33.32it/s, est. speed input: 3017.79 toks/s, output: 1113.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:40<00:30, 32.62it/s, est. speed input: 3062.11 toks/s, output: 1134.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:41<00:36, 27.06it/s, est. speed input: 3139.60 toks/s, output: 1164.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:33, 29.95it/s, est. speed input: 3186.78 toks/s, output: 1186.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:30, 32.99it/s, est. speed input: 3234.09 toks/s, output: 1207.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:41<00:25, 37.85it/s, est. speed input: 3328.63 toks/s, output: 1267.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:41<00:12, 75.34it/s, est. speed input: 3596.38 toks/s, output: 1378.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:17, 52.92it/s, est. speed input: 3722.49 toks/s, output: 1444.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:42<00:13, 69.27it/s, est. speed input: 3930.27 toks/s, output: 1548.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:13, 66.76it/s, est. speed input: 4063.94 toks/s, output: 1605.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:42<00:09, 89.49it/s, est. speed input: 4327.66 toks/s, output: 1738.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:42<00:11, 72.21it/s, est. speed input: 4463.12 toks/s, output: 1809.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:14, 58.32it/s, est. speed input: 4541.08 toks/s, output: 1849.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:16, 52.55it/s, est. speed input: 4614.02 toks/s, output: 1893.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:43<00:11, 70.82it/s, est. speed input: 4811.69 toks/s, output: 2010.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:43<00:13, 60.74it/s, est. speed input: 4887.21 toks/s, output: 2060.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:09, 79.67it/s, est. speed input: 5077.45 toks/s, output: 2173.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:44<00:08, 88.36it/s, est. speed input: 5219.41 toks/s, output: 2246.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:44<00:09, 83.59it/s, est. speed input: 5352.92 toks/s, output: 2318.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:44<00:08, 86.30it/s, est. speed input: 5493.37 toks/s, output: 2395.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:44<00:07, 99.10it/s, est. speed input: 5719.63 toks/s, output: 2529.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:44<00:07, 92.98it/s, est. speed input: 5843.93 toks/s, output: 2588.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:45<00:07, 91.21it/s, est. speed input: 5982.14 toks/s, output: 2660.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:45<00:07, 92.64it/s, est. speed input: 6073.53 toks/s, output: 2723.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:45<00:08, 82.46it/s, est. speed input: 6155.63 toks/s, output: 2785.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:45<00:11, 58.56it/s, est. speed input: 6216.84 toks/s, output: 2821.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:45<00:09, 70.93it/s, est. speed input: 6354.94 toks/s, output: 2911.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:46<00:10, 60.02it/s, est. speed input: 6414.66 toks/s, output: 2960.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:46<00:09, 66.12it/s, est. speed input: 6543.82 toks/s, output: 3045.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:46<00:08, 70.94it/s, est. speed input: 6670.12 toks/s, output: 3121.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:46<00:08, 70.33it/s, est. speed input: 6743.18 toks/s, output: 3182.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:46<00:08, 73.68it/s, est. speed input: 6827.13 toks/s, output: 3232.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:46<00:08, 68.75it/s, est. speed input: 6900.99 toks/s, output: 3264.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:47<00:09, 59.37it/s, est. speed input: 7002.92 toks/s, output: 3342.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:47<00:11, 48.23it/s, est. speed input: 7056.57 toks/s, output: 3384.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:47<00:12, 43.62it/s, est. speed input: 7109.89 toks/s, output: 3430.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:48<00:12, 42.28it/s, est. speed input: 7200.48 toks/s, output: 3479.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:48<00:10, 49.67it/s, est. speed input: 7280.99 toks/s, output: 3535.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:48<00:09, 53.24it/s, est. speed input: 7349.03 toks/s, output: 3577.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:48<00:09, 51.51it/s, est. speed input: 7416.58 toks/s, output: 3628.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:06, 74.90it/s, est. speed input: 7584.42 toks/s, output: 3770.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:48<00:05, 80.47it/s, est. speed input: 7705.23 toks/s, output: 3864.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:04, 91.46it/s, est. speed input: 7829.30 toks/s, output: 3956.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:49<00:04, 106.46it/s, est. speed input: 7993.13 toks/s, output: 4077.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:49<00:03, 107.29it/s, est. speed input: 8109.07 toks/s, output: 4197.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:49<00:04, 90.63it/s, est. speed input: 8221.38 toks/s, output: 4272.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:49<00:03, 110.78it/s, est. speed input: 8386.24 toks/s, output: 4399.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:49<00:02, 148.92it/s, est. speed input: 8664.75 toks/s, output: 4608.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:49<00:02, 121.77it/s, est. speed input: 8807.37 toks/s, output: 4718.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:50<00:03, 104.50it/s, est. speed input: 8910.62 toks/s, output: 4818.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:03, 98.87it/s, est. speed input: 9022.10 toks/s, output: 4912.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:50<00:02, 116.44it/s, est. speed input: 9197.52 toks/s, output: 5057.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:50<00:01, 153.75it/s, est. speed input: 9445.39 toks/s, output: 5265.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:50<00:02, 112.35it/s, est. speed input: 9569.61 toks/s, output: 5361.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:50<00:01, 123.99it/s, est. speed input: 9727.96 toks/s, output: 5501.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:51<00:01, 119.52it/s, est. speed input: 9869.10 toks/s, output: 5652.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:51<00:01, 104.27it/s, est. speed input: 9959.81 toks/s, output: 5740.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:51<00:01, 93.98it/s, est. speed input: 10096.84 toks/s, output: 5858.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:51<00:01, 85.86it/s, est. speed input: 10187.98 toks/s, output: 5978.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:01, 81.68it/s, est. speed input: 10249.05 toks/s, output: 6067.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:51<00:01, 83.92it/s, est. speed input: 10318.42 toks/s, output: 6150.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:52<00:01, 86.51it/s, est. speed input: 10418.82 toks/s, output: 6277.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:01, 87.42it/s, est. speed input: 10487.04 toks/s, output: 6333.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:52<00:01, 61.49it/s, est. speed input: 10513.09 toks/s, output: 6399.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:01, 66.26it/s, est. speed input: 10578.49 toks/s, output: 6486.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:52<00:01, 53.93it/s, est. speed input: 10614.44 toks/s, output: 6530.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:01, 45.51it/s, est. speed input: 10639.34 toks/s, output: 6587.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 52.36it/s, est. speed input: 10700.36 toks/s, output: 6669.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:53<00:00, 57.33it/s, est. speed input: 10754.68 toks/s, output: 6734.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 47.25it/s, est. speed input: 10795.66 toks/s, output: 6816.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 43.95it/s, est. speed input: 10823.67 toks/s, output: 6871.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 30.95it/s, est. speed input: 10788.15 toks/s, output: 6902.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:09<00:00, 1.97it/s, est. speed input: 8549.57 toks/s, output: 5511.77 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:09<00:00, 18.45it/s, est. speed input: 8549.57 toks/s, output: 5511.77 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.3897044360637665, 'actor/pg_clipfrac': 0.002197802299633622, 'actor/ppo_kl': -0.0003626226098276675}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003078643640037626, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.2026798685838003e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.10494152456521988, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004277254338376224}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.09587360918521881, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008000073139555752}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002908438618760556, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004074854659847915}
[36m(Runner pid=3309020)[0m Step 48
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.247
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.041
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 3.9770794218441095e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.66
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.66
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 989743
[36m(Runner pid=3309020)[0m balanced_min: 988896
[36m(Runner pid=3309020)[0m max: 999459
[36m(Runner pid=3309020)[0m mean: 989319.5
[36m(Runner pid=3309020)[0m min: 979180
[36m(Runner pid=3309020)[0m minmax_diff: 20279
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.24
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1149.091
[36m(Runner pid=3309020)[0m time_per_step: 860.958
[36m(Runner pid=3309020)[0m total_num_tokens: 1978639
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 679.0
[36m(Runner pid=3309020)[0m mean: 467.012
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 2431.0
[36m(Runner pid=3309020)[0m mean: 305.894
[36m(Runner pid=3309020)[0m min: 55.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.321
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.66
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.144
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.286
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.355
[36m(Runner pid=3309020)[0m gen: 112.596
[36m(Runner pid=3309020)[0m old: 86.583
[36m(Runner pid=3309020)[0m ref: 88.329
[36m(Runner pid=3309020)[0m reward: 6.485
[36m(Runner pid=3309020)[0m step: 860.958
[36m(Runner pid=3309020)[0m update_actor: 565.976
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 49; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:27:20 [executor_base.py:219] It took 0.340287 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:28:46 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:27:20 [executor_base.py:219] It took 0.339915 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:28:46 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.86 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:28:46 [executor_base.py:208] It took 0.326691 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:29:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:29:00 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:29:00 [executor_base.py:208] It took 0.325901 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.017623072490096092, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.18752989172935486, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.5237438678741455, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00033055461244657636, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010257004760205746}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.15564827620983124, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015043267922010273}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.37673434615135193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.05028386041522026, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008253637352026999}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.04738591983914375, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000647135719191283}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00029659687425009906, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.11064204573631287, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.4121568500995636, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002560559078119695}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2660066485404968, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.512829601764679, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.21295061707496643, 'actor/pg_clipfrac': 0.0037499999161809683, 'actor/ppo_kl': 0.0019356870325282216}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6912704110145569, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010581372771412134}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.17229022085666656, 'actor/pg_clipfrac': 0.0016012809937819839, 'actor/ppo_kl': 0.00033040958805941045}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.07679414004087448, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00023337722814176232}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.10541495680809021, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005961913848295808}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.13854582607746124, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009487951174378395}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0875406339764595, 'actor/pg_clipfrac': 0.0011061946861445904, 'actor/ppo_kl': -0.0011648435611277819}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3905792236328125, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014461445389315486}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0004381632024887949, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000628958223387599}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5574964880943298, 'actor/pg_clipfrac': 0.0013289035996422172, 'actor/ppo_kl': -0.0005120818968862295}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.1910976767539978, 'actor/pg_clipfrac': 0.0009115770226344466, 'actor/ppo_kl': -0.0001763262989697978}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.058373644948005676, 'actor/pg_clipfrac': 0.0032786885276436806, 'actor/ppo_kl': 1.4022660252521746e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3270626664161682, 'actor/pg_clipfrac': 0.0014409221475943923, 'actor/ppo_kl': 0.0017357876058667898}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.37590280175209045, 'actor/pg_clipfrac': 0.0009643201483413577, 'actor/ppo_kl': -0.003316507674753666}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.05956907942891121, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012867606710642576}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2646329402923584, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005313950823619962}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00040883489418774843, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005678259185515344}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.37467074394226074, 'actor/pg_clipfrac': 0.001357773202471435, 'actor/ppo_kl': -0.000893192074727267}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.08429745584726334, 'actor/pg_clipfrac': 0.0015515903942286968, 'actor/ppo_kl': -0.0005797363119199872}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.13612793385982513, 'actor/pg_clipfrac': 0.0009233610471710563, 'actor/ppo_kl': -0.0004539313667919487}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.11233280599117279, 'actor/pg_clipfrac': 0.0008103727595880628, 'actor/ppo_kl': 0.0007421951158903539}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2882618308067322, 'actor/pg_clipfrac': 0.0033783784601837397, 'actor/ppo_kl': -4.799462476512417e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.23431934416294098, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012743417173624039}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.06913045793771744, 'actor/pg_clipfrac': 0.002129925414919853, 'actor/ppo_kl': -0.0007933930610306561}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0004607884038705379, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008691635448485613}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00028021816979162395, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00013071909779682755}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.09271477907896042, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007213785429485142}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0827108696103096, 'actor/pg_clipfrac': 0.0006093845004215837, 'actor/ppo_kl': -0.0005924380384385586}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1861681491136551, 'actor/pg_clipfrac': 0.0017467249417677522, 'actor/ppo_kl': -0.00024309783475473523}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00040359009290114045, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001072767423465848}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.19627854228019714, 'actor/pg_clipfrac': 0.0017761989729478955, 'actor/ppo_kl': 0.000616630888544023}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002652750408742577, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004919966449961066}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.000251397374086082, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011071746703237295}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.004344271961599588, 'actor/pg_clipfrac': 0.000749063678085804, 'actor/ppo_kl': -0.0006986682419665158}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2738785743713379, 'actor/pg_clipfrac': 0.002020202111452818, 'actor/ppo_kl': -0.0012240477371960878}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.189778134226799, 'actor/pg_clipfrac': 0.0014749262481927872, 'actor/ppo_kl': -0.0016423790948465466}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.1535133272409439, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002408602595096454}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003385573800187558, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003026867052540183}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2151036560535431, 'actor/pg_clipfrac': 0.0008810572908259928, 'actor/ppo_kl': -0.0007618505042046309}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:31:13, 4.29s/it, est. speed input: 102.50 toks/s, output: 26.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<51:04, 2.41s/it, est. speed input: 162.72 toks/s, output: 44.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<32:39, 1.55s/it, est. speed input: 225.68 toks/s, output: 64.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:30<20:46, 1.01it/s, est. speed input: 294.94 toks/s, output: 85.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<14:28, 1.44it/s, est. speed input: 358.92 toks/s, output: 102.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<13:04, 1.59it/s, est. speed input: 400.49 toks/s, output: 118.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<08:58, 2.31it/s, est. speed input: 467.16 toks/s, output: 137.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<06:17, 3.28it/s, est. speed input: 530.28 toks/s, output: 159.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<05:51, 3.52it/s, est. speed input: 572.10 toks/s, output: 176.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<04:18, 4.75it/s, est. speed input: 634.32 toks/s, output: 197.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<01:13, 16.26it/s, est. speed input: 1019.13 toks/s, output: 324.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:07, 17.50it/s, est. speed input: 1134.26 toks/s, output: 369.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<01:00, 19.42it/s, est. speed input: 1247.53 toks/s, output: 412.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<00:42, 27.66it/s, est. speed input: 1439.58 toks/s, output: 485.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<00:35, 32.78it/s, est. speed input: 1557.69 toks/s, output: 530.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:36<00:36, 31.51it/s, est. speed input: 1667.88 toks/s, output: 574.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:37<00:32, 34.76it/s, est. speed input: 1782.16 toks/s, output: 619.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:27, 40.69it/s, est. speed input: 1897.94 toks/s, output: 661.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:23, 46.52it/s, est. speed input: 2009.82 toks/s, output: 705.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:37<00:23, 47.02it/s, est. speed input: 2120.76 toks/s, output: 752.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:38, 28.74it/s, est. speed input: 2201.54 toks/s, output: 789.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:38<00:46, 23.51it/s, est. speed input: 2236.91 toks/s, output: 801.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:38<00:50, 21.51it/s, est. speed input: 2278.80 toks/s, output: 823.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:39<00:41, 26.14it/s, est. speed input: 2384.01 toks/s, output: 867.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:39<00:37, 28.35it/s, est. speed input: 2486.98 toks/s, output: 919.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:26, 39.42it/s, est. speed input: 2651.08 toks/s, output: 982.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:24, 43.24it/s, est. speed input: 2754.60 toks/s, output: 1031.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:27, 37.83it/s, est. speed input: 2850.39 toks/s, output: 1075.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:40<00:22, 45.44it/s, est. speed input: 2951.67 toks/s, output: 1119.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:40<00:16, 60.80it/s, est. speed input: 3119.75 toks/s, output: 1193.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:40<00:17, 55.45it/s, est. speed input: 3221.82 toks/s, output: 1244.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:40<00:17, 56.76it/s, est. speed input: 3318.92 toks/s, output: 1293.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:40<00:15, 63.68it/s, est. speed input: 3426.45 toks/s, output: 1346.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:41<00:20, 46.09it/s, est. speed input: 3501.56 toks/s, output: 1384.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:41<00:26, 36.52it/s, est. speed input: 3580.43 toks/s, output: 1424.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:41<00:23, 39.44it/s, est. speed input: 3677.97 toks/s, output: 1473.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:24, 38.10it/s, est. speed input: 3719.93 toks/s, output: 1493.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:42<00:16, 55.82it/s, est. speed input: 3869.16 toks/s, output: 1571.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:42<00:20, 43.94it/s, est. speed input: 3943.25 toks/s, output: 1611.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:22, 40.44it/s, est. speed input: 4025.21 toks/s, output: 1661.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:42<00:16, 52.78it/s, est. speed input: 4174.00 toks/s, output: 1734.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:43<00:14, 59.73it/s, est. speed input: 4316.56 toks/s, output: 1810.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:13, 62.72it/s, est. speed input: 4406.10 toks/s, output: 1866.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:14, 57.25it/s, est. speed input: 4490.83 toks/s, output: 1909.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:16, 51.05it/s, est. speed input: 4570.04 toks/s, output: 1950.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:43<00:15, 53.76it/s, est. speed input: 4707.81 toks/s, output: 2020.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:44<00:21, 37.79it/s, est. speed input: 4761.90 toks/s, output: 2057.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:18, 44.16it/s, est. speed input: 4892.42 toks/s, output: 2135.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:15, 50.45it/s, est. speed input: 4983.43 toks/s, output: 2203.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:44<00:14, 54.82it/s, est. speed input: 5069.23 toks/s, output: 2247.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:45<00:12, 63.40it/s, est. speed input: 5209.56 toks/s, output: 2324.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:45<00:15, 49.25it/s, est. speed input: 5275.23 toks/s, output: 2366.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:12, 61.83it/s, est. speed input: 5409.62 toks/s, output: 2463.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:11, 65.29it/s, est. speed input: 5492.03 toks/s, output: 2511.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:45<00:08, 81.66it/s, est. speed input: 5732.42 toks/s, output: 2640.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:46<00:08, 85.46it/s, est. speed input: 5865.69 toks/s, output: 2719.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:08, 80.01it/s, est. speed input: 5946.37 toks/s, output: 2774.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:46<00:06, 99.86it/s, est. speed input: 6122.60 toks/s, output: 2882.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:46<00:05, 116.81it/s, est. speed input: 6307.95 toks/s, output: 2990.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:46<00:05, 120.23it/s, est. speed input: 6448.67 toks/s, output: 3083.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:46<00:05, 122.91it/s, est. speed input: 6575.32 toks/s, output: 3167.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:46<00:05, 116.19it/s, est. speed input: 6706.01 toks/s, output: 3272.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:46<00:04, 129.79it/s, est. speed input: 6886.74 toks/s, output: 3370.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:47<00:03, 152.48it/s, est. speed input: 7115.85 toks/s, output: 3524.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:47<00:04, 118.27it/s, est. speed input: 7274.84 toks/s, output: 3643.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:47<00:05, 89.77it/s, est. speed input: 7379.19 toks/s, output: 3731.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:47<00:06, 83.59it/s, est. speed input: 7492.47 toks/s, output: 3827.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:47<00:04, 103.06it/s, est. speed input: 7668.03 toks/s, output: 3974.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:48<00:04, 94.14it/s, est. speed input: 7785.74 toks/s, output: 4053.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:48<00:04, 102.90it/s, est. speed input: 7948.55 toks/s, output: 4167.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:48<00:04, 100.63it/s, est. speed input: 8061.71 toks/s, output: 4236.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:48<00:02, 135.87it/s, est. speed input: 8329.77 toks/s, output: 4423.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:48<00:02, 157.13it/s, est. speed input: 8548.80 toks/s, output: 4566.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:48<00:02, 125.77it/s, est. speed input: 8696.55 toks/s, output: 4673.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:49<00:02, 117.75it/s, est. speed input: 8805.80 toks/s, output: 4767.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:49<00:02, 116.26it/s, est. speed input: 8926.01 toks/s, output: 4846.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:49<00:02, 110.14it/s, est. speed input: 9036.80 toks/s, output: 4944.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:49<00:02, 103.79it/s, est. speed input: 9153.95 toks/s, output: 5032.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:49<00:03, 93.36it/s, est. speed input: 9255.68 toks/s, output: 5113.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:49<00:02, 103.35it/s, est. speed input: 9377.81 toks/s, output: 5223.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:50<00:02, 88.09it/s, est. speed input: 9475.49 toks/s, output: 5291.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:02, 90.23it/s, est. speed input: 9548.53 toks/s, output: 5348.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:50<00:02, 82.42it/s, est. speed input: 9644.46 toks/s, output: 5445.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:50<00:02, 80.07it/s, est. speed input: 9712.23 toks/s, output: 5508.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:50<00:03, 61.95it/s, est. speed input: 9751.38 toks/s, output: 5559.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:50<00:02, 67.84it/s, est. speed input: 9814.72 toks/s, output: 5629.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:51<00:01, 105.97it/s, est. speed input: 10084.88 toks/s, output: 5935.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 95.23it/s, est. speed input: 10189.05 toks/s, output: 6054.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:51<00:01, 109.01it/s, est. speed input: 10345.89 toks/s, output: 6206.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:51<00:01, 107.71it/s, est. speed input: 10510.99 toks/s, output: 6324.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:51<00:01, 92.88it/s, est. speed input: 10612.37 toks/s, output: 6405.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:51<00:00, 92.52it/s, est. speed input: 10711.17 toks/s, output: 6496.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:52<00:01, 45.87it/s, est. speed input: 10671.00 toks/s, output: 6498.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:52<00:00, 62.96it/s, est. speed input: 10824.63 toks/s, output: 6659.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:52<00:00, 81.71it/s, est. speed input: 10984.65 toks/s, output: 6831.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:53<00:00, 48.38it/s, est. speed input: 10979.01 toks/s, output: 6876.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 37.15it/s, est. speed input: 10982.39 toks/s, output: 6941.73 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.62it/s, est. speed input: 11016.32 toks/s, output: 6981.83 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.8817986845970154, 'actor/pg_clipfrac': 0.0012277470668777823, 'actor/ppo_kl': -0.001366108888760209}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.33181673288345337, 'actor/pg_clipfrac': 0.003225806402042508, 'actor/ppo_kl': 5.1937309763161466e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.26828819513320923, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014362396905198693}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002504354633856565, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003163632354699075}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.49188709259033203, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000254820246482268}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.3091576099395752, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00127751927357167}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.24220293760299683, 'actor/pg_clipfrac': 0.0006020469591021538, 'actor/ppo_kl': -0.00030523203895427287}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.04759232699871063, 'actor/pg_clipfrac': 0.002098635770380497, 'actor/ppo_kl': 0.0004877948958892375}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3503839671611786, 'actor/pg_clipfrac': 0.0009930486558005214, 'actor/ppo_kl': 0.00032996563822962344}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.1107923835515976, 'actor/pg_clipfrac': 0.0014566642930731177, 'actor/ppo_kl': -0.0008008808363229036}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0009793757926672697, 'actor/pg_clipfrac': 0.0027624310459941626, 'actor/ppo_kl': -0.0005854998016729951}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1976647973060608, 'actor/pg_clipfrac': 0.0042432816699147224, 'actor/ppo_kl': -0.0007987420540302992}
[36m(Runner pid=3309020)[0m Step 49
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.244
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.021
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.013
[36m(Runner pid=3309020)[0m ppo_kl: 6.360651284023077e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.658
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.658
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 979518
[36m(Runner pid=3309020)[0m balanced_min: 978253
[36m(Runner pid=3309020)[0m max: 982177
[36m(Runner pid=3309020)[0m mean: 978885.5
[36m(Runner pid=3309020)[0m min: 975594
[36m(Runner pid=3309020)[0m minmax_diff: 6583
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.685
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(Runner pid=3309020)[0m throughput: 1131.503
[36m(Runner pid=3309020)[0m time_per_step: 865.12
[36m(Runner pid=3309020)[0m total_num_tokens: 1957771
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.953
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3041.0
[36m(Runner pid=3309020)[0m mean: 297.801
[36m(Runner pid=3309020)[0m min: 65.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.316
[36m(Runner pid=3309020)[0m format: 0.999
[36m(Runner pid=3309020)[0m overall: 0.658
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.25106302153999e-05
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.181
[36m(Runner pid=3309020)[0m gen: 115.739
[36m(Runner pid=3309020)[0m old: 87.842
[36m(Runner pid=3309020)[0m ref: 88.938
[36m(Runner pid=3309020)[0m reward: 6.35
[36m(Runner pid=3309020)[0m step: 865.12
[36m(Runner pid=3309020)[0m update_actor: 565.478
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 50; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:41:48 [executor_base.py:219] It took 0.363242 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.76 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:43:12 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:41:48 [executor_base.py:219] It took 0.330949 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:43:13 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:43:13 [executor_base.py:208] It took 0.330245 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.84 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:43:28 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:43:28 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:43:28 [executor_base.py:208] It took 0.327121 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.06881148368120193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011275227880105376}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.15805649757385254, 'actor/pg_clipfrac': 0.0017074558418244123, 'actor/ppo_kl': -0.0011614548275247216}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6803881525993347, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.587359530385584e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.18958629667758942, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00031822433811612427}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.821643590927124, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.12239792197942734, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.15967579185962677, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.06293430924415588, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.005406474228948355, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.25916481018066406, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.07884986698627472, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.24503502249717712, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11414735019207001, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00021998352895025164}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.3614068329334259, 'actor/pg_clipfrac': 0.0005330490530468524, 'actor/ppo_kl': -0.0019128673011437058}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.14870941638946533, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.40745678544044495, 'actor/pg_clipfrac': 0.0011757789179682732, 'actor/ppo_kl': 0.0004969108267687261}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5996936559677124, 'actor/pg_clipfrac': 0.000589622650295496, 'actor/ppo_kl': -0.0001317352580372244}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003103394410572946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00034104043152183294}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0156329944729805, 'actor/pg_clipfrac': 0.004070556256920099, 'actor/ppo_kl': 5.955482629360631e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.24988426268100739, 'actor/pg_clipfrac': 0.002970297122374177, 'actor/ppo_kl': 0.0003153168363496661}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.23316729068756104, 'actor/pg_clipfrac': 0.0012202562065795064, 'actor/ppo_kl': 0.00010823475895449519}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.4211163818836212, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001997147686779499}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1835378259420395, 'actor/pg_clipfrac': 0.0005341880605556071, 'actor/ppo_kl': 0.0005328288534656167}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00043625387479551136, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017790126148611307}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.6455765962600708, 'actor/pg_clipfrac': 0.002270147670060396, 'actor/ppo_kl': -6.411375943571329e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.8441609144210815, 'actor/pg_clipfrac': 0.0010787486098706722, 'actor/ppo_kl': 0.0032392500434070826}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5918237566947937, 'actor/pg_clipfrac': 0.005267778877168894, 'actor/ppo_kl': 0.0010895566083490849}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.5258346199989319, 'actor/pg_clipfrac': 0.0008841733215376735, 'actor/ppo_kl': 0.0001103715694625862}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00023526173026766628, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025116722099483013}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.27775558829307556, 'actor/pg_clipfrac': 0.0008833922329358757, 'actor/ppo_kl': -0.0009723316179588437}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3183521330356598, 'actor/pg_clipfrac': 0.002415458904579282, 'actor/ppo_kl': -9.067047358257696e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11619206517934799, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00036885292502120137}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.44755125045776367, 'actor/pg_clipfrac': 0.0022512380965054035, 'actor/ppo_kl': 0.00045111458166502416}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.28140756487846375, 'actor/pg_clipfrac': 0.0009174311999231577, 'actor/ppo_kl': 0.00036382238613441586}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.34390997886657715, 'actor/pg_clipfrac': 0.0010460250778123736, 'actor/ppo_kl': 0.0013215531362220645}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0003111199475824833, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002990775683429092}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.24009770154953003, 'actor/pg_clipfrac': 0.0020429010037332773, 'actor/ppo_kl': -0.001723510562442243}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.01041032001376152, 'actor/pg_clipfrac': 0.0016420361353084445, 'actor/ppo_kl': 0.000697821844369173}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.5091114044189453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003026800404768437}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.35737550258636475, 'actor/pg_clipfrac': 0.0007855459698475897, 'actor/ppo_kl': 0.0005178009741939604}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00045724931987933815, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00019523641094565392}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.5287584066390991, 'actor/pg_clipfrac': 0.0037842951714992523, 'actor/ppo_kl': -0.00362785835750401}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3474991023540497, 'actor/pg_clipfrac': 0.0036529679782688618, 'actor/ppo_kl': -0.0017201340524479747}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.22233834862709045, 'actor/pg_clipfrac': 0.0008460236713290215, 'actor/ppo_kl': -0.0005138706765137613}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.17723120748996735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004564444534480572}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.05965906381607056, 'actor/pg_clipfrac': 0.0008936550584621727, 'actor/ppo_kl': 0.0008100998820737004}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:16<1:40:31, 16.04s/it, est. speed input: 29.98 toks/s, output: 6.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:16<26:22, 4.23s/it, est. speed input: 90.15 toks/s, output: 20.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:16<10:26, 1.69s/it, est. speed input: 170.56 toks/s, output: 41.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:16<04:18, 1.42it/s, est. speed input: 307.63 toks/s, output: 78.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 17/377 [00:16<02:09, 2.77it/s, est. speed input: 470.47 toks/s, output: 123.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 21/377 [00:16<01:31, 3.89it/s, est. speed input: 571.86 toks/s, output: 153.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 24/377 [00:17<01:11, 4.95it/s, est. speed input: 649.52 toks/s, output: 176.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 27/377 [00:17<00:55, 6.29it/s, est. speed input: 725.36 toks/s, output: 200.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 30/377 [00:17<00:43, 7.89it/s, est. speed input: 798.93 toks/s, output: 224.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 37/377 [00:17<00:25, 13.41it/s, est. speed input: 977.11 toks/s, output: 282.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 47/377 [00:17<00:14, 22.77it/s, est. speed input: 1234.12 toks/s, output: 368.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 52/377 [00:17<00:12, 25.68it/s, est. speed input: 1355.61 toks/s, output: 410.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 57/377 [00:17<00:13, 23.99it/s, est. speed input: 1466.09 toks/s, output: 450.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 67/377 [00:18<00:08, 34.71it/s, est. speed input: 1714.10 toks/s, output: 539.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 75/377 [00:18<00:07, 41.31it/s, est. speed input: 1908.25 toks/s, output: 612.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 86/377 [00:18<00:05, 50.24it/s, est. speed input: 2173.43 toks/s, output: 715.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 93/377 [00:18<00:06, 47.22it/s, est. speed input: 2328.83 toks/s, output: 777.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▋ | 99/377 [00:18<00:05, 48.42it/s, est. speed input: 2465.84 toks/s, output: 832.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 105/377 [00:18<00:05, 46.73it/s, est. speed input: 2596.96 toks/s, output: 887.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 118/377 [00:18<00:04, 60.06it/s, est. speed input: 2893.33 toks/s, output: 1015.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 125/377 [00:19<00:04, 59.11it/s, est. speed input: 3053.75 toks/s, output: 1083.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 133/377 [00:19<00:03, 61.24it/s, est. speed input: 3229.04 toks/s, output: 1163.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 145/377 [00:19<00:03, 72.51it/s, est. speed input: 3501.60 toks/s, output: 1287.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 153/377 [00:19<00:03, 61.62it/s, est. speed input: 3659.41 toks/s, output: 1366.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 162/377 [00:19<00:03, 66.03it/s, est. speed input: 3853.77 toks/s, output: 1462.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 170/377 [00:19<00:03, 67.58it/s, est. speed input: 4023.48 toks/s, output: 1546.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 178/377 [00:19<00:03, 63.24it/s, est. speed input: 4182.89 toks/s, output: 1628.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 185/377 [00:19<00:03, 60.89it/s, est. speed input: 4321.08 toks/s, output: 1702.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████▏ | 194/377 [00:20<00:02, 61.25it/s, est. speed input: 4501.27 toks/s, output: 1800.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 204/377 [00:20<00:02, 70.34it/s, est. speed input: 4715.65 toks/s, output: 1916.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 212/377 [00:20<00:02, 63.48it/s, est. speed input: 4867.18 toks/s, output: 2005.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:20<00:02, 66.96it/s, est. speed input: 5045.72 toks/s, output: 2111.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 228/377 [00:20<00:02, 60.04it/s, est. speed input: 5169.53 toks/s, output: 2189.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 235/377 [00:20<00:02, 51.66it/s, est. speed input: 5281.94 toks/s, output: 2265.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 242/377 [00:20<00:02, 54.44it/s, est. speed input: 5413.85 toks/s, output: 2351.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 251/377 [00:21<00:02, 61.87it/s, est. speed input: 5593.03 toks/s, output: 2467.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 263/377 [00:21<00:01, 69.42it/s, est. speed input: 5825.98 toks/s, output: 2622.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 271/377 [00:21<00:01, 64.69it/s, est. speed input: 5966.07 toks/s, output: 2721.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 279/377 [00:21<00:01, 64.53it/s, est. speed input: 6107.30 toks/s, output: 2827.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 286/377 [00:21<00:01, 58.88it/s, est. speed input: 6222.47 toks/s, output: 2916.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 293/377 [00:21<00:01, 50.00it/s, est. speed input: 6321.74 toks/s, output: 3002.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 300/377 [00:21<00:01, 53.76it/s, est. speed input: 6443.56 toks/s, output: 3101.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 309/377 [00:22<00:01, 60.17it/s, est. speed input: 6602.69 toks/s, output: 3236.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 318/377 [00:22<00:00, 64.28it/s, est. speed input: 6760.12 toks/s, output: 3372.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 325/377 [00:22<00:01, 47.82it/s, est. speed input: 6831.57 toks/s, output: 3457.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 331/377 [00:22<00:01, 36.91it/s, est. speed input: 6877.25 toks/s, output: 3527.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 336/377 [00:22<00:01, 35.71it/s, est. speed input: 6933.51 toks/s, output: 3600.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 341/377 [00:23<00:01, 32.58it/s, est. speed input: 6980.16 toks/s, output: 3668.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:23<00:01, 29.19it/s, est. speed input: 7012.24 toks/s, output: 3721.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 349/377 [00:23<00:00, 30.23it/s, est. speed input: 7058.77 toks/s, output: 3786.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 356/377 [00:23<00:00, 37.92it/s, est. speed input: 7170.39 toks/s, output: 3918.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:23<00:00, 34.44it/s, est. speed input: 7220.91 toks/s, output: 3997.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 365/377 [00:23<00:00, 31.78it/s, est. speed input: 7258.35 toks/s, output: 4062.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:24<00:00, 15.78it/s, est. speed input: 7151.28 toks/s, output: 4052.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:43<00:00, 15.78it/s, est. speed input: 6252.67 toks/s, output: 3589.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [00:51<00:09, 1.97s/it, est. speed input: 3436.50 toks/s, output: 2043.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [00:52<00:07, 1.95s/it, est. speed input: 3328.87 toks/s, output: 2051.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [01:02<00:05, 2.52s/it, est. speed input: 2843.60 toks/s, output: 1901.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:04<00:00, 2.15s/it, est. speed input: 2777.91 toks/s, output: 2019.17 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:04<00:00, 5.88it/s, est. speed input: 2777.91 toks/s, output: 2019.17 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.29127058386802673, 'actor/pg_clipfrac': 0.004166666883975267, 'actor/ppo_kl': -0.0003026346385013312}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.22964933514595032, 'actor/pg_clipfrac': 0.0018028846243396401, 'actor/ppo_kl': 0.0007022504578344524}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4426763951778412, 'actor/pg_clipfrac': 0.0005780346691608429, 'actor/ppo_kl': 0.0015523260226473212}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.06333813816308975, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008547254255972803}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.3534943759441376, 'actor/pg_clipfrac': 0.0030674845911562443, 'actor/ppo_kl': 0.0004591210454236716}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0004120892845094204, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005392056773416698}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4559079706668854, 'actor/pg_clipfrac': 0.0006648935959674418, 'actor/ppo_kl': -0.0010986480629071593}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 1.2337400913238525, 'actor/pg_clipfrac': 0.0007471980061382055, 'actor/ppo_kl': 0.00039171858225017786}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00031479494646191597, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014430246665142477}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.023268923163414, 'actor/pg_clipfrac': 0.0005636978312395513, 'actor/ppo_kl': 0.0007583473925478756}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.3267441689968109, 'actor/pg_clipfrac': 0.0029850746504962444, 'actor/ppo_kl': 0.0007785484194755554}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.05983264744281769, 'actor/pg_clipfrac': 0.0008326394599862397, 'actor/ppo_kl': 0.0009713168838061392}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0001971318561118096, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00043884760816581547}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0006763444980606437, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009510915260761976}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.4276587665081024, 'actor/pg_clipfrac': 0.004052685108035803, 'actor/ppo_kl': -0.0016331484075635672}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002059868857031688, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016547899576835334}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0001960241061169654, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008670078823342919}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.29780706763267517, 'actor/pg_clipfrac': 0.00538599630817771, 'actor/ppo_kl': -0.002696008188650012}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:56:44 [executor_base.py:219] It took 0.342519 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:58:50 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:56:44 [executor_base.py:219] It took 0.344913 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:58:50 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 09:58:50 [executor_base.py:208] It took 0.328120 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:59:09 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:59:09 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 09:59:09 [executor_base.py:208] It took 0.325633 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric properties and constraints given in the problem. The perimeter of triangle ABC is 16.0, and point D is the midpoint of AB, with BD = 2.0. Since D is the midpoint of AB, AD = BD = 2.0. The line l is the perpendicular bisector of AB, meaning E is always equidistant from A and B, and the distance AE = BE.\n\nThe key insight here is that the perimeter of triangle AEC is minimized when E is such that AE + EC is minimized. Since E is on the line l, which is the perpendicular bisector of AB, the shortest distance from A to E and from E to C will be along the line segment AC. This means that AE + EC is minimized when E is at the point where the line l intersects AC, making AE + EC = AC.\n\nGiven that the perimeter of triangle ABC is 16.0 and AD = BD = 2.0, we can deduce the following:\n- AB = 2 + 2 = 4.\n- Since the perimeter of ABC is 16.0, AC + BC = 16.0 - 4 = 12.0.\n- Because E is on the perpendicular bisector of AB, the minimum perimeter of triangle AEC is AC.\n\nTherefore, the minimum perimeter of triangle AEC is AC, which is 12.0.\n\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90.0 degrees to obtain triangle AB'C', we can infer the following:\n1. Since the rotation is 90.0 degrees, angle BAC is 90.0 degrees.\n2. After rotation, point B becomes point B', and point C becomes point C'.\n3. Since angle BAC is 90.0 degrees, angle B'AC' is also 90.0 degrees.\n4. We are given that angle CC'B' = 32.0 degrees.\n5. Since angle BAC is 90.0 degrees and angle B'AC' is also 90.0 degrees, angle AC'B' is the angle we need to find.\n6. The sum of angles in a triangle is 180.0 degrees. Therefore, angle AC'B' = 180.0 - angle B'AC' - angle CC'B' = 180.0 - 90.0 - 32.0 = 58.0 degrees.\n\n\nThe size of angle AC'B' is 58.0 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are on circle O, and C is on the diameter BE, we can infer that angle ADC is equal to angle AEB because they subtend the same arc AD. Since angle ADC = 54.0 degrees, angle AEB must also be 54.0 degrees. This is because in a circle, an angle inscribed in a semicircle is a right angle, but since C is on the diameter BE, angle AEB is simply the angle subtended by arc AD.\n\n\n54.0 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, angle A is 35 degrees. Since angle A is an angle in the right triangle OAD (where AD is the tangent and OA is the radius), angle AOD is 90 degrees. The angle C can be found by considering angle COD, which is the angle subtended by the same arc as angle A at the circumference. Since angle AOD is 90 degrees and angle A is 35 degrees, angle COD (which is twice angle A) is 70 degrees. Therefore, angle C is 90 degrees minus angle COD, which is 20 degrees.\n\n\n20 degrees\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, we know that angle ABD is 90 degrees because any angle subtended by a diameter in a semicircle is a right angle. Since EA is a tangent to the circle at point A, angle OAE is 90 degrees. Given that angle EAC is 120 degrees, we can find angle BAC by subtracting angle OAE from angle EAC, which gives us angle BAC = 120 degrees - 90 degrees = 30 degrees. Since angle ABC is an inscribed angle that subtends the same arc as angle BAC, it is half of the central angle OAC, which is 60 degrees. Therefore, angle ABC = 60 degrees / 2 = 30 degrees.\n\n\nThe degree of angle ABC is 30 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_35
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 50
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.261
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.007
[36m(Runner pid=3309020)[0m ppo_kl: 4.90032097815174e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.666
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.666
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 975613
[36m(Runner pid=3309020)[0m balanced_min: 974441
[36m(Runner pid=3309020)[0m max: 975517
[36m(Runner pid=3309020)[0m mean: 975027.0
[36m(Runner pid=3309020)[0m min: 974537
[36m(Runner pid=3309020)[0m minmax_diff: 980
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.877
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.12
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:18<1:20:10, 3.77s/it, est. speed input: 114.23 toks/s, output: 21.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:21<38:46, 1.83s/it, est. speed input: 209.36 toks/s, output: 40.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<29:43, 1.41s/it, est. speed input: 273.40 toks/s, output: 51.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<22:30, 1.07s/it, est. speed input: 330.06 toks/s, output: 70.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:42, 1.25it/s, est. speed input: 387.16 toks/s, output: 83.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<11:49, 1.76it/s, est. speed input: 451.96 toks/s, output: 105.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<10:52, 1.91it/s, est. speed input: 489.06 toks/s, output: 123.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:34<09:18, 2.22it/s, est. speed input: 531.75 toks/s, output: 139.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:34<06:46, 3.04it/s, est. speed input: 595.32 toks/s, output: 158.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<04:51, 4.21it/s, est. speed input: 660.60 toks/s, output: 182.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<03:44, 5.45it/s, est. speed input: 720.11 toks/s, output: 198.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:04, 6.61it/s, est. speed input: 777.45 toks/s, output: 217.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:25, 8.37it/s, est. speed input: 837.90 toks/s, output: 239.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:03, 9.83it/s, est. speed input: 895.49 toks/s, output: 259.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<01:58, 10.18it/s, est. speed input: 948.82 toks/s, output: 279.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:00, 19.51it/s, est. speed input: 1134.89 toks/s, output: 348.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<00:48, 24.32it/s, est. speed input: 1255.41 toks/s, output: 392.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<00:45, 25.78it/s, est. speed input: 1312.97 toks/s, output: 410.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:57, 20.34it/s, est. speed input: 1355.85 toks/s, output: 427.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:51, 22.63it/s, est. speed input: 1412.58 toks/s, output: 446.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:43, 26.50it/s, est. speed input: 1525.77 toks/s, output: 493.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:44, 25.85it/s, est. speed input: 1577.66 toks/s, output: 516.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:41, 27.78it/s, est. speed input: 1632.54 toks/s, output: 539.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:38, 29.63it/s, est. speed input: 1688.69 toks/s, output: 563.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<01:12, 15.56it/s, est. speed input: 1716.12 toks/s, output: 581.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:46, 24.06it/s, est. speed input: 1881.23 toks/s, output: 644.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:29, 37.34it/s, est. speed input: 2105.25 toks/s, output: 733.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:28, 37.96it/s, est. speed input: 2156.01 toks/s, output: 757.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:23, 45.42it/s, est. speed input: 2266.54 toks/s, output: 808.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:20, 52.73it/s, est. speed input: 2375.64 toks/s, output: 860.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:15, 68.35it/s, est. speed input: 2539.53 toks/s, output: 930.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:16, 64.94it/s, est. speed input: 2643.56 toks/s, output: 971.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:18, 57.04it/s, est. speed input: 2742.93 toks/s, output: 1018.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:22, 44.84it/s, est. speed input: 2883.30 toks/s, output: 1091.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:19, 51.64it/s, est. speed input: 2981.59 toks/s, output: 1146.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:17, 58.54it/s, est. speed input: 3085.02 toks/s, output: 1199.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:21, 46.89it/s, est. speed input: 3165.56 toks/s, output: 1238.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:17, 54.88it/s, est. speed input: 3276.01 toks/s, output: 1296.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:15, 62.78it/s, est. speed input: 3379.33 toks/s, output: 1340.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:41<00:11, 80.37it/s, est. speed input: 3530.01 toks/s, output: 1408.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:11, 84.47it/s, est. speed input: 3633.76 toks/s, output: 1464.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:14, 63.83it/s, est. speed input: 3726.02 toks/s, output: 1506.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:42<00:16, 56.57it/s, est. speed input: 3907.38 toks/s, output: 1601.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:13, 66.00it/s, est. speed input: 4062.25 toks/s, output: 1680.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:43<00:11, 76.57it/s, est. speed input: 4264.81 toks/s, output: 1795.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:43<00:14, 61.07it/s, est. speed input: 4341.57 toks/s, output: 1838.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:21, 40.15it/s, est. speed input: 4390.68 toks/s, output: 1876.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:18, 44.91it/s, est. speed input: 4520.73 toks/s, output: 1942.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:14, 56.67it/s, est. speed input: 4758.09 toks/s, output: 2078.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:11, 72.46it/s, est. speed input: 4946.42 toks/s, output: 2186.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:14, 54.85it/s, est. speed input: 5002.88 toks/s, output: 2228.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:13, 56.47it/s, est. speed input: 5086.09 toks/s, output: 2279.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:12, 61.35it/s, est. speed input: 5175.71 toks/s, output: 2333.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:09, 82.53it/s, est. speed input: 5365.67 toks/s, output: 2433.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:08, 85.77it/s, est. speed input: 5493.56 toks/s, output: 2502.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:45<00:07, 94.91it/s, est. speed input: 5627.48 toks/s, output: 2582.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:45<00:09, 71.01it/s, est. speed input: 5735.23 toks/s, output: 2638.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:13, 53.72it/s, est. speed input: 5796.21 toks/s, output: 2671.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:11, 57.61it/s, est. speed input: 5874.11 toks/s, output: 2735.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:13, 48.91it/s, est. speed input: 5938.02 toks/s, output: 2782.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:46<00:11, 56.63it/s, est. speed input: 6073.74 toks/s, output: 2869.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:46<00:10, 61.25it/s, est. speed input: 6159.68 toks/s, output: 2921.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:47<00:09, 65.64it/s, est. speed input: 6242.76 toks/s, output: 2966.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:47<00:08, 75.21it/s, est. speed input: 6362.00 toks/s, output: 3052.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:08, 69.81it/s, est. speed input: 6440.73 toks/s, output: 3087.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:09, 66.25it/s, est. speed input: 6514.43 toks/s, output: 3139.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:47<00:05, 112.31it/s, est. speed input: 6786.37 toks/s, output: 3322.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:47<00:05, 100.38it/s, est. speed input: 6945.53 toks/s, output: 3431.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:48<00:05, 102.39it/s, est. speed input: 7116.42 toks/s, output: 3557.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:48<00:04, 117.67it/s, est. speed input: 7287.60 toks/s, output: 3677.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:48<00:04, 116.78it/s, est. speed input: 7407.45 toks/s, output: 3759.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:04, 116.22it/s, est. speed input: 7550.65 toks/s, output: 3879.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:04, 117.23it/s, est. speed input: 7670.96 toks/s, output: 3967.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:03, 130.79it/s, est. speed input: 7838.34 toks/s, output: 4093.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:48<00:03, 130.28it/s, est. speed input: 7960.46 toks/s, output: 4162.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:49<00:03, 123.75it/s, est. speed input: 8086.35 toks/s, output: 4257.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:49<00:03, 130.82it/s, est. speed input: 8259.47 toks/s, output: 4385.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:49<00:03, 125.33it/s, est. speed input: 8377.06 toks/s, output: 4492.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:49<00:02, 128.75it/s, est. speed input: 8536.96 toks/s, output: 4602.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:49<00:02, 131.12it/s, est. speed input: 8656.33 toks/s, output: 4692.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:49<00:02, 154.14it/s, est. speed input: 8859.33 toks/s, output: 4845.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:49<00:02, 130.07it/s, est. speed input: 9009.24 toks/s, output: 4963.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:49<00:01, 153.75it/s, est. speed input: 9220.30 toks/s, output: 5128.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:50<00:01, 152.23it/s, est. speed input: 9375.87 toks/s, output: 5266.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:01, 156.93it/s, est. speed input: 9540.45 toks/s, output: 5379.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:50<00:01, 147.68it/s, est. speed input: 9698.58 toks/s, output: 5500.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:50<00:01, 151.84it/s, est. speed input: 9864.40 toks/s, output: 5630.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:50<00:01, 134.74it/s, est. speed input: 10022.05 toks/s, output: 5784.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:50<00:01, 131.80it/s, est. speed input: 10136.54 toks/s, output: 5892.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 114.88it/s, est. speed input: 10280.73 toks/s, output: 6044.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:01, 111.80it/s, est. speed input: 10384.71 toks/s, output: 6139.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:51<00:01, 110.54it/s, est. speed input: 10492.72 toks/s, output: 6259.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:51<00:00, 125.36it/s, est. speed input: 10650.42 toks/s, output: 6401.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:51<00:00, 129.71it/s, est. speed input: 10768.06 toks/s, output: 6505.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:51<00:00, 103.79it/s, est. speed input: 10844.19 toks/s, output: 6595.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:52<00:00, 62.13it/s, est. speed input: 10873.93 toks/s, output: 6671.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:52<00:00, 46.03it/s, est. speed input: 10870.02 toks/s, output: 6695.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:52<00:00, 50.46it/s, est. speed input: 10931.61 toks/s, output: 6791.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 29.14it/s, est. speed input: 10858.19 toks/s, output: 6796.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 16.95it/s, est. speed input: 10687.27 toks/s, output: 6742.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 18.74it/s, est. speed input: 10693.58 toks/s, output: 6797.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 12.47it/s, est. speed input: 10531.23 toks/s, output: 6726.37 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.72it/s, est. speed input: 10531.23 toks/s, output: 6726.37 toks/s]
[36m(Runner pid=3309020)[0m throughput: 891.329
[36m(Runner pid=3309020)[0m time_per_step: 1093.902
[36m(Runner pid=3309020)[0m total_num_tokens: 1950054
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.191
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3018.0
[36m(Runner pid=3309020)[0m mean: 295.548
[36m(Runner pid=3309020)[0m min: 58.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.334
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.666
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.446390303350821e-05
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.184
[36m(Runner pid=3309020)[0m gen: 114.718
[36m(Runner pid=3309020)[0m old: 86.767
[36m(Runner pid=3309020)[0m ref: 88.659
[36m(Runner pid=3309020)[0m reward: 6.556
[36m(Runner pid=3309020)[0m save_checkpoint: 31.379
[36m(Runner pid=3309020)[0m step: 1093.902
[36m(Runner pid=3309020)[0m update_actor: 564.421
[36m(Runner pid=3309020)[0m validation: 200.597
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.401
[36m(Runner pid=3309020)[0m format_reward: 0.973
[36m(Runner pid=3309020)[0m overall_reward: 0.688
[36m(Runner pid=3309020)[0m reward_score: 0.688
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.98
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 51; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:00:04 [executor_base.py:219] It took 0.341055 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:01:33 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:00:04 [executor_base.py:219] It took 0.340482 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:01:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:01:34 [executor_base.py:208] It took 0.327746 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:01:34 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:01:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:01:34 [executor_base.py:208] It took 0.325637 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00032663598540239036, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015989114763215184}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003197821497451514, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0004089733411092311, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.32480496168136597, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7077470421791077, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2195003777742386, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0003111596161033958, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001001407508738339}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.07136811316013336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000828940945211798}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.4065779149532318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002441992110107094, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012021706206724048}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3864770829677582, 'actor/pg_clipfrac': 0.001640689093619585, 'actor/ppo_kl': 0.0013581449165940285}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0842270702123642, 'actor/pg_clipfrac': 0.0014641288435086608, 'actor/ppo_kl': 0.0006842745933681726}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.12068036198616028, 'actor/pg_clipfrac': 0.000649772584438324, 'actor/ppo_kl': -0.0008936987724155188}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003427445190027356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.15483109652996063, 'actor/pg_clipfrac': 0.0027816412039101124, 'actor/ppo_kl': -0.0005470694741234183}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003352957428433001, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010499964701011777}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.4748721122741699, 'actor/pg_clipfrac': 0.0030395137146115303, 'actor/ppo_kl': -0.0017110699554905295}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0001521246158517897, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001256638643098995}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.02642451785504818, 'actor/pg_clipfrac': 0.0031914892606437206, 'actor/ppo_kl': -0.0007444422226399183}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0003123614878859371, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00043197436025366187}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002795848122332245, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005659792805090547}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0006152259302325547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001684777089394629}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.635040819644928, 'actor/pg_clipfrac': 0.0009259259095415473, 'actor/ppo_kl': 0.0026324200443923473}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.029559915885329247, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00045375822810456157}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.39053428173065186, 'actor/pg_clipfrac': 0.0011641443707048893, 'actor/ppo_kl': 0.00043808401096612215}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.05352349579334259, 'actor/pg_clipfrac': 0.0008841733215376735, 'actor/ppo_kl': -0.0002854125341400504}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3510294258594513, 'actor/pg_clipfrac': 0.0022014309652149677, 'actor/ppo_kl': 0.0005723232170566916}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002594389079604298, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008964920998550951}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.2283487170934677, 'actor/pg_clipfrac': 0.002594033721834421, 'actor/ppo_kl': -0.0022024111822247505}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2185913473367691, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': 6.373505311785266e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00034619096550159156, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00033728600828908384}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3186761438846588, 'actor/pg_clipfrac': 0.001855287584476173, 'actor/ppo_kl': -0.0006359132239595056}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.15093038976192474, 'actor/pg_clipfrac': 0.0017137960530817509, 'actor/ppo_kl': 0.0005148092750459909}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4858616590499878, 'actor/pg_clipfrac': 0.0041631972417235374, 'actor/ppo_kl': -0.00022576273477170616}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.13551978766918182, 'actor/pg_clipfrac': 0.0006531678372994065, 'actor/ppo_kl': -0.0008613828103989363}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.089377760887146, 'actor/pg_clipfrac': 0.002595155732706189, 'actor/ppo_kl': -0.00013753370149061084}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002778038033284247, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00029736378928646445}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.10421767830848694, 'actor/pg_clipfrac': 0.0021367522422224283, 'actor/ppo_kl': 0.0012133111013099551}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0003408925549592823, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002385516418144107}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.15517275035381317, 'actor/pg_clipfrac': 0.0023121386766433716, 'actor/ppo_kl': -0.0021033943630754948}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0001462777581764385, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011192021192982793}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.1604911834001541, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007461906061507761}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3816218078136444, 'actor/pg_clipfrac': 0.0007836990407668054, 'actor/ppo_kl': -0.0008664385532028973}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.7306721210479736, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00028605046099983156}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.19218094646930695, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009822616120800376}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.048090558499097824, 'actor/pg_clipfrac': 0.0014545454178005457, 'actor/ppo_kl': -0.0005406022537499666}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0002718487230595201, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00032084633130580187}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.13214673101902008, 'actor/pg_clipfrac': 0.0025402200408279896, 'actor/ppo_kl': 7.505037501687184e-06}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.20235216617584229, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.003622565884143114}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.024133380502462387, 'actor/pg_clipfrac': 0.0017590149072930217, 'actor/ppo_kl': 0.0012429202906787395}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00020350424165371805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007946460973471403}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.8669939637184143, 'actor/pg_clipfrac': 0.005173688288778067, 'actor/ppo_kl': -0.0010939229978248477}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0005676904693245888, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002722220087889582}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.7336317896842957, 'actor/pg_clipfrac': 0.0007037297473289073, 'actor/ppo_kl': 0.001482193823903799}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.37794724106788635, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006173317087814212}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.28241783380508423, 'actor/pg_clipfrac': 0.006701414939016104, 'actor/ppo_kl': -0.0018265227554365993}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0397527776658535, 'actor/pg_clipfrac': 0.004683840554207563, 'actor/ppo_kl': 0.0006503388867713511}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.43998289108276367, 'actor/pg_clipfrac': 0.001010101055726409, 'actor/ppo_kl': -0.0011204478796571493}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00042536231921985745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001071386388503015}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.163103848695755, 'actor/pg_clipfrac': 0.0017809439450502396, 'actor/ppo_kl': -0.0001011370331980288}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00027384149143472314, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.167294900980778e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.06188798323273659, 'actor/pg_clipfrac': 0.00272479560226202, 'actor/ppo_kl': -0.0014085250440984964}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:21:11, 3.82s/it, est. speed input: 118.83 toks/s, output: 21.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<43:05, 2.04s/it, est. speed input: 200.12 toks/s, output: 44.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<27:51, 1.32s/it, est. speed input: 271.74 toks/s, output: 66.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<23:53, 1.14s/it, est. speed input: 308.38 toks/s, output: 79.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<17:46, 1.18it/s, est. speed input: 366.32 toks/s, output: 94.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<09:19, 2.23it/s, est. speed input: 495.85 toks/s, output: 140.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<07:43, 2.67it/s, est. speed input: 556.15 toks/s, output: 160.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<06:18, 3.26it/s, est. speed input: 616.08 toks/s, output: 177.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<04:56, 4.15it/s, est. speed input: 674.60 toks/s, output: 202.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:33<03:41, 5.53it/s, est. speed input: 737.52 toks/s, output: 222.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:34<03:44, 5.44it/s, est. speed input: 781.34 toks/s, output: 237.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:54, 6.97it/s, est. speed input: 837.31 toks/s, output: 261.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<01:59, 10.10it/s, est. speed input: 958.41 toks/s, output: 307.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:14, 15.97it/s, est. speed input: 1141.41 toks/s, output: 374.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:36<01:10, 16.90it/s, est. speed input: 1198.27 toks/s, output: 395.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:36<01:13, 16.03it/s, est. speed input: 1246.37 toks/s, output: 413.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<00:58, 20.08it/s, est. speed input: 1360.76 toks/s, output: 459.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:52, 22.09it/s, est. speed input: 1418.83 toks/s, output: 479.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<01:14, 15.53it/s, est. speed input: 1458.46 toks/s, output: 493.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:50, 22.62it/s, est. speed input: 1575.64 toks/s, output: 538.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:34, 33.23it/s, est. speed input: 1743.11 toks/s, output: 610.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:36, 31.24it/s, est. speed input: 1791.65 toks/s, output: 632.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:37, 29.74it/s, est. speed input: 1838.56 toks/s, output: 658.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:49, 22.76it/s, est. speed input: 1882.92 toks/s, output: 678.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:41, 27.02it/s, est. speed input: 1988.33 toks/s, output: 724.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:44, 24.91it/s, est. speed input: 2028.87 toks/s, output: 745.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:39, 27.52it/s, est. speed input: 2082.47 toks/s, output: 774.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:39, 27.59it/s, est. speed input: 2130.91 toks/s, output: 795.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:23, 46.53it/s, est. speed input: 2303.48 toks/s, output: 871.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:25, 41.35it/s, est. speed input: 2402.04 toks/s, output: 919.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:23, 45.44it/s, est. speed input: 2506.74 toks/s, output: 971.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:19, 53.43it/s, est. speed input: 2609.49 toks/s, output: 1013.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:16, 63.08it/s, est. speed input: 2775.96 toks/s, output: 1086.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:18, 56.35it/s, est. speed input: 2877.34 toks/s, output: 1128.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:16, 62.89it/s, est. speed input: 2982.49 toks/s, output: 1172.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:17, 56.32it/s, est. speed input: 3074.23 toks/s, output: 1216.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:20, 48.71it/s, est. speed input: 3168.83 toks/s, output: 1261.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:18, 52.40it/s, est. speed input: 3272.30 toks/s, output: 1306.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:17, 55.42it/s, est. speed input: 3370.96 toks/s, output: 1362.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:42<00:24, 40.03it/s, est. speed input: 3448.66 toks/s, output: 1406.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:23, 41.29it/s, est. speed input: 3495.94 toks/s, output: 1421.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:20, 47.09it/s, est. speed input: 3585.71 toks/s, output: 1472.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:25, 36.60it/s, est. speed input: 3655.49 toks/s, output: 1519.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:42<00:16, 56.19it/s, est. speed input: 3851.24 toks/s, output: 1613.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:43<00:17, 51.23it/s, est. speed input: 3934.92 toks/s, output: 1663.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:12, 69.70it/s, est. speed input: 4134.66 toks/s, output: 1786.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:12, 69.92it/s, est. speed input: 4226.90 toks/s, output: 1837.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:15, 57.59it/s, est. speed input: 4307.70 toks/s, output: 1877.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:12, 69.59it/s, est. speed input: 4455.59 toks/s, output: 1939.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:12, 66.55it/s, est. speed input: 4544.19 toks/s, output: 1983.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:12, 69.18it/s, est. speed input: 4635.80 toks/s, output: 2037.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:12, 66.29it/s, est. speed input: 4724.38 toks/s, output: 2085.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:44<00:11, 69.67it/s, est. speed input: 4815.15 toks/s, output: 2135.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:09, 85.98it/s, est. speed input: 5012.19 toks/s, output: 2248.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:44<00:10, 78.12it/s, est. speed input: 5094.87 toks/s, output: 2298.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:44<00:12, 63.48it/s, est. speed input: 5170.59 toks/s, output: 2349.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:13, 55.67it/s, est. speed input: 5248.06 toks/s, output: 2388.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:17, 42.48it/s, est. speed input: 5304.43 toks/s, output: 2418.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:20, 36.15it/s, est. speed input: 5359.75 toks/s, output: 2453.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:12, 56.87it/s, est. speed input: 5543.87 toks/s, output: 2573.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:46<00:11, 63.50it/s, est. speed input: 5626.27 toks/s, output: 2627.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:09, 75.32it/s, est. speed input: 5848.46 toks/s, output: 2773.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:09, 69.25it/s, est. speed input: 5929.05 toks/s, output: 2828.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:46<00:10, 64.93it/s, est. speed input: 6009.08 toks/s, output: 2873.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:46<00:09, 68.44it/s, est. speed input: 6092.40 toks/s, output: 2930.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:47<00:09, 71.73it/s, est. speed input: 6177.52 toks/s, output: 2996.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:47<00:07, 84.66it/s, est. speed input: 6313.86 toks/s, output: 3072.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:06, 95.43it/s, est. speed input: 6441.00 toks/s, output: 3159.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:47<00:05, 104.09it/s, est. speed input: 6569.03 toks/s, output: 3268.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:05, 111.30it/s, est. speed input: 6699.92 toks/s, output: 3344.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:47<00:05, 96.51it/s, est. speed input: 6859.11 toks/s, output: 3479.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:48<00:06, 80.36it/s, est. speed input: 6961.91 toks/s, output: 3551.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:48<00:06, 89.84it/s, est. speed input: 7094.60 toks/s, output: 3644.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:48<00:06, 80.94it/s, est. speed input: 7211.29 toks/s, output: 3723.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:48<00:05, 91.86it/s, est. speed input: 7337.16 toks/s, output: 3799.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:48<00:05, 84.58it/s, est. speed input: 7451.38 toks/s, output: 3887.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:05, 83.10it/s, est. speed input: 7522.16 toks/s, output: 3935.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:03, 117.51it/s, est. speed input: 7783.37 toks/s, output: 4107.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:49<00:03, 119.36it/s, est. speed input: 7911.47 toks/s, output: 4202.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:49<00:04, 103.60it/s, est. speed input: 8025.02 toks/s, output: 4280.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:49<00:03, 109.20it/s, est. speed input: 8148.40 toks/s, output: 4371.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:49<00:03, 105.38it/s, est. speed input: 8275.52 toks/s, output: 4456.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 96.65it/s, est. speed input: 8389.22 toks/s, output: 4541.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:49<00:03, 107.58it/s, est. speed input: 8547.52 toks/s, output: 4652.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:50<00:04, 81.57it/s, est. speed input: 8632.82 toks/s, output: 4739.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:50<00:04, 70.86it/s, est. speed input: 8689.17 toks/s, output: 4782.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:03, 89.11it/s, est. speed input: 8923.67 toks/s, output: 4980.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:50<00:02, 97.61it/s, est. speed input: 9042.37 toks/s, output: 5087.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:50<00:03, 87.99it/s, est. speed input: 9142.30 toks/s, output: 5181.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:51<00:03, 84.65it/s, est. speed input: 9206.80 toks/s, output: 5238.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:51<00:02, 113.86it/s, est. speed input: 9446.13 toks/s, output: 5457.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:51<00:02, 107.03it/s, est. speed input: 9549.65 toks/s, output: 5560.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:51<00:01, 108.37it/s, est. speed input: 9654.81 toks/s, output: 5664.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:51<00:01, 115.49it/s, est. speed input: 9775.83 toks/s, output: 5790.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:01, 93.96it/s, est. speed input: 9865.12 toks/s, output: 5875.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:52<00:02, 73.89it/s, est. speed input: 9938.87 toks/s, output: 5975.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:52<00:01, 82.08it/s, est. speed input: 10043.45 toks/s, output: 6076.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:52<00:01, 92.93it/s, est. speed input: 10155.89 toks/s, output: 6198.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:52<00:01, 65.25it/s, est. speed input: 10213.72 toks/s, output: 6269.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:01, 84.20it/s, est. speed input: 10366.25 toks/s, output: 6431.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:01, 73.03it/s, est. speed input: 10443.27 toks/s, output: 6527.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 60.84it/s, est. speed input: 10479.73 toks/s, output: 6583.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:01, 58.60it/s, est. speed input: 10530.87 toks/s, output: 6639.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 61.55it/s, est. speed input: 10582.12 toks/s, output: 6704.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:54<00:00, 54.94it/s, est. speed input: 10618.24 toks/s, output: 6764.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:54<00:00, 57.79it/s, est. speed input: 10672.75 toks/s, output: 6855.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 64.99it/s, est. speed input: 10742.85 toks/s, output: 6958.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 44.68it/s, est. speed input: 10752.98 toks/s, output: 7023.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 38.60it/s, est. speed input: 10765.07 toks/s, output: 7082.55 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 23.23it/s, est. speed input: 10765.07 toks/s, output: 7082.55 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.07477987557649612, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011680088937282562}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.10494732111692429, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012317558284848928}
[36m(Runner pid=3309020)[0m Step 51
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.296
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.004
[36m(Runner pid=3309020)[0m ppo_kl: -6.719996506454607e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.009
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 970652
[36m(Runner pid=3309020)[0m balanced_min: 970652
[36m(Runner pid=3309020)[0m max: 972376
[36m(Runner pid=3309020)[0m mean: 970652.0
[36m(Runner pid=3309020)[0m min: 968928
[36m(Runner pid=3309020)[0m minmax_diff: 3448
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.773
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.119
[36m(Runner pid=3309020)[0m throughput: 1144.972
[36m(Runner pid=3309020)[0m time_per_step: 847.752
[36m(Runner pid=3309020)[0m total_num_tokens: 1941304
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 626.0
[36m(Runner pid=3309020)[0m mean: 463.033
[36m(Runner pid=3309020)[0m min: 413.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1281.0
[36m(Runner pid=3309020)[0m mean: 295.289
[36m(Runner pid=3309020)[0m min: 47.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.355
[36m(Runner pid=3309020)[0m format: 0.999
[36m(Runner pid=3309020)[0m overall: 0.677
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.700877347801863e-05
[36m(Runner pid=3309020)[0m gen: 0.14
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.289
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.169
[36m(Runner pid=3309020)[0m gen: 105.923
[36m(Runner pid=3309020)[0m old: 87.197
[36m(Runner pid=3309020)[0m ref: 86.909
[36m(Runner pid=3309020)[0m reward: 6.097
[36m(Runner pid=3309020)[0m step: 847.752
[36m(Runner pid=3309020)[0m update_actor: 560.844
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 52; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:14:11 [executor_base.py:219] It took 0.339454 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:14:11 [executor_base.py:219] It took 0.341426 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:15:37 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:15:37 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:15:37 [executor_base.py:208] It took 0.325535 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:15:42 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:15:42 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:15:42 [executor_base.py:208] It took 0.327066 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1405307799577713, 'actor/pg_clipfrac': 0.0013227512827143073, 'actor/ppo_kl': 0.0004035253368783742}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.2984468638896942, 'actor/pg_clipfrac': 0.0020140986889600754, 'actor/ppo_kl': -0.0007866132655180991}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.1196763664484024, 'actor/pg_clipfrac': 0.0016949152341112494, 'actor/ppo_kl': -0.0004474640008993447}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.8269575238227844, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7322760224342346, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018409286858513951}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.12992338836193085, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.21213264763355255, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.2257352611632086e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.10041068494319916, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00045033940114080906}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.07518996298313141, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.6536440253257751, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006240516086108983}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003611476277001202, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012748149456456304}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0004566638090182096, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.31649431586265564, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00033962674206122756, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.07261960953474045, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0045896670781075954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004417161690071225}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.1282084584236145, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005425179842859507}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.3207635283470154, 'actor/pg_clipfrac': 0.002063273685052991, 'actor/ppo_kl': 0.000588228169362992}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.2813536524772644, 'actor/pg_clipfrac': 0.0006807352183386683, 'actor/ppo_kl': -0.00036166782956570387}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.36555296182632446, 'actor/pg_clipfrac': 0.0007230658084154129, 'actor/ppo_kl': -0.0015743308467790484}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.112877756357193, 'actor/pg_clipfrac': 0.000718907278496772, 'actor/ppo_kl': -0.0007813849369995296}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003246848937124014, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003973931889049709}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0001796433498384431, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00017997185932472348}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.07940912991762161, 'actor/pg_clipfrac': 0.0010834236163645983, 'actor/ppo_kl': -0.0010911873541772366}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.34182479977607727, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000886310066562146}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00038987724110484123, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008279519970528781}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00018495427502784878, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00090171885676682}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.40967538952827454, 'actor/pg_clipfrac': 0.0009199631749652326, 'actor/ppo_kl': 0.0002367867127759382}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00034676905488595366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000701010983902961}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.22654585540294647, 'actor/pg_clipfrac': 0.001372997765429318, 'actor/ppo_kl': 0.00021581497276201844}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.08594302088022232, 'actor/pg_clipfrac': 0.0020140986889600754, 'actor/ppo_kl': 0.0009571010014042258}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0005404046969488263, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00020846392726525664}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.17433376610279083, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': 0.0005757890176028013}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.269974946975708, 'actor/pg_clipfrac': 0.0004621072148438543, 'actor/ppo_kl': 0.0004805557837244123}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00022604948026128113, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000504370778799057}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005851694149896502, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005491457995958626}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2402734011411667, 'actor/pg_clipfrac': 0.0013157895300537348, 'actor/ppo_kl': 0.0009885235922411084}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00039892803761176765, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005300216726027429}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002893874188885093, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002638555597513914}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.4418468475341797, 'actor/pg_clipfrac': 0.0010905124945566058, 'actor/ppo_kl': 0.001148585812188685}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.27987149357795715, 'actor/pg_clipfrac': 0.0004782400792464614, 'actor/ppo_kl': 0.0011958564864471555}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4581662714481354, 'actor/pg_clipfrac': 0.0020020019728690386, 'actor/ppo_kl': 0.0003778001409955323}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.18914538621902466, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001185471992357634}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.08363240957260132, 'actor/pg_clipfrac': 0.0015090543311089277, 'actor/ppo_kl': 0.00025286184973083436}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00023534421052318066, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007347059436142445}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2627379894256592, 'actor/pg_clipfrac': 0.0016220599645748734, 'actor/ppo_kl': -0.0004081664083059877}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3271923065185547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012304699048399925}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.04487199708819389, 'actor/pg_clipfrac': 0.002176278503611684, 'actor/ppo_kl': 0.0003326362057123333}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.09723591804504395, 'actor/pg_clipfrac': 0.002785515272989869, 'actor/ppo_kl': 0.0007641601259820163}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.18362800776958466, 'actor/pg_clipfrac': 0.0011037527583539486, 'actor/ppo_kl': -0.0009786098962649703}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.15514174103736877, 'actor/pg_clipfrac': 0.0016771488590165973, 'actor/ppo_kl': 0.0005246652290225029}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.008135543204843998, 'actor/pg_clipfrac': 0.002613240387290716, 'actor/ppo_kl': -0.00048554898239672184}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.8511494994163513, 'actor/pg_clipfrac': 0.002290076343342662, 'actor/ppo_kl': 0.0012655039317905903}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.16718387603759766, 'actor/pg_clipfrac': 0.0006172839784994721, 'actor/ppo_kl': 0.0006720919627696276}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002462461416143924, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010936420876532793}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:25:32, 4.03s/it, est. speed input: 115.76 toks/s, output: 25.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:20<36:54, 1.74s/it, est. speed input: 224.10 toks/s, output: 50.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<36:56, 1.75s/it, est. speed input: 236.92 toks/s, output: 60.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<25:31, 1.22s/it, est. speed input: 295.05 toks/s, output: 79.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<12:15, 1.70it/s, est. speed input: 436.55 toks/s, output: 128.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<09:59, 2.08it/s, est. speed input: 493.16 toks/s, output: 145.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<07:24, 2.79it/s, est. speed input: 558.18 toks/s, output: 168.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:34<07:13, 2.85it/s, est. speed input: 595.08 toks/s, output: 179.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<04:04, 5.00it/s, est. speed input: 724.85 toks/s, output: 229.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:18, 6.15it/s, est. speed input: 784.16 toks/s, output: 252.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:36, 7.77it/s, est. speed input: 842.58 toks/s, output: 277.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:08, 9.38it/s, est. speed input: 901.53 toks/s, output: 297.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:35<01:05, 18.19it/s, est. speed input: 1094.13 toks/s, output: 365.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:14, 16.04it/s, est. speed input: 1141.20 toks/s, output: 388.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:36<01:09, 17.09it/s, est. speed input: 1199.80 toks/s, output: 410.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<00:55, 21.15it/s, est. speed input: 1315.75 toks/s, output: 449.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<00:50, 23.01it/s, est. speed input: 1373.85 toks/s, output: 473.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<01:24, 13.68it/s, est. speed input: 1449.90 toks/s, output: 502.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:38<01:26, 13.35it/s, est. speed input: 1492.35 toks/s, output: 529.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<01:13, 15.72it/s, est. speed input: 1548.06 toks/s, output: 554.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<01:06, 17.33it/s, est. speed input: 1601.84 toks/s, output: 579.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:44, 25.54it/s, est. speed input: 1718.18 toks/s, output: 632.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:39<00:35, 31.31it/s, est. speed input: 1829.02 toks/s, output: 680.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:28, 39.35it/s, est. speed input: 1939.24 toks/s, output: 728.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:30, 36.73it/s, est. speed input: 2038.82 toks/s, output: 776.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:32, 33.40it/s, est. speed input: 2136.19 toks/s, output: 828.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:36, 29.98it/s, est. speed input: 2231.85 toks/s, output: 864.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:30, 35.28it/s, est. speed input: 2335.40 toks/s, output: 918.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:29, 36.53it/s, est. speed input: 2387.68 toks/s, output: 942.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:40<00:23, 45.50it/s, est. speed input: 2545.89 toks/s, output: 1016.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:41<00:18, 55.97it/s, est. speed input: 2698.26 toks/s, output: 1083.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:22, 45.74it/s, est. speed input: 2788.61 toks/s, output: 1128.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:17, 58.98it/s, est. speed input: 3015.38 toks/s, output: 1249.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:13, 72.55it/s, est. speed input: 3174.43 toks/s, output: 1338.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:14, 70.27it/s, est. speed input: 3276.08 toks/s, output: 1382.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:10, 94.27it/s, est. speed input: 3535.04 toks/s, output: 1519.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:14, 65.51it/s, est. speed input: 3657.09 toks/s, output: 1569.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:16, 57.85it/s, est. speed input: 3749.51 toks/s, output: 1610.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:42<00:15, 59.85it/s, est. speed input: 3844.77 toks/s, output: 1669.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:43<00:15, 57.61it/s, est. speed input: 3932.20 toks/s, output: 1717.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:10, 83.95it/s, est. speed input: 4183.00 toks/s, output: 1837.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:12, 70.76it/s, est. speed input: 4274.90 toks/s, output: 1882.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:12, 71.99it/s, est. speed input: 4371.97 toks/s, output: 1939.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:43<00:12, 68.11it/s, est. speed input: 4461.64 toks/s, output: 1993.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:43<00:14, 57.08it/s, est. speed input: 4542.47 toks/s, output: 2047.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:44<00:14, 58.01it/s, est. speed input: 4633.16 toks/s, output: 2098.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:12, 67.11it/s, est. speed input: 4770.17 toks/s, output: 2180.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:44<00:16, 49.02it/s, est. speed input: 4837.49 toks/s, output: 2224.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:17, 46.74it/s, est. speed input: 4912.25 toks/s, output: 2275.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:45<00:18, 43.38it/s, est. speed input: 5027.46 toks/s, output: 2321.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:15, 51.05it/s, est. speed input: 5159.77 toks/s, output: 2395.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:11, 65.36it/s, est. speed input: 5342.97 toks/s, output: 2527.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:11, 64.52it/s, est. speed input: 5465.56 toks/s, output: 2593.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:10, 70.78it/s, est. speed input: 5596.86 toks/s, output: 2683.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:46<00:09, 71.92it/s, est. speed input: 5677.56 toks/s, output: 2740.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:08, 77.79it/s, est. speed input: 5803.99 toks/s, output: 2826.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:08, 78.51it/s, est. speed input: 5929.27 toks/s, output: 2907.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:46<00:08, 78.78it/s, est. speed input: 6016.11 toks/s, output: 2959.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:47<00:12, 51.71it/s, est. speed input: 6060.50 toks/s, output: 2984.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:47<00:10, 64.99it/s, est. speed input: 6192.75 toks/s, output: 3071.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:47<00:10, 63.02it/s, est. speed input: 6272.34 toks/s, output: 3130.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:47<00:09, 64.72it/s, est. speed input: 6360.58 toks/s, output: 3186.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:47<00:08, 70.70it/s, est. speed input: 6485.39 toks/s, output: 3270.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:47<00:08, 72.35it/s, est. speed input: 6644.67 toks/s, output: 3369.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:48<00:07, 79.79it/s, est. speed input: 6768.27 toks/s, output: 3437.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:48<00:07, 81.24it/s, est. speed input: 6849.29 toks/s, output: 3496.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:48<00:05, 93.49it/s, est. speed input: 6975.73 toks/s, output: 3577.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:48<00:06, 87.59it/s, est. speed input: 7091.60 toks/s, output: 3645.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:48<00:04, 110.20it/s, est. speed input: 7264.03 toks/s, output: 3754.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:48<00:03, 128.44it/s, est. speed input: 7477.30 toks/s, output: 3900.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:03, 121.33it/s, est. speed input: 7637.84 toks/s, output: 4005.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:03, 148.58it/s, est. speed input: 7866.58 toks/s, output: 4171.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:49<00:03, 135.61it/s, est. speed input: 8019.89 toks/s, output: 4297.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:03, 133.27it/s, est. speed input: 8144.84 toks/s, output: 4385.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:04, 89.88it/s, est. speed input: 8234.64 toks/s, output: 4477.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 103.48it/s, est. speed input: 8404.42 toks/s, output: 4601.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 106.35it/s, est. speed input: 8519.61 toks/s, output: 4696.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:50<00:03, 97.90it/s, est. speed input: 8629.38 toks/s, output: 4791.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:50<00:02, 118.05it/s, est. speed input: 8798.57 toks/s, output: 4934.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:50<00:02, 121.67it/s, est. speed input: 8947.93 toks/s, output: 5065.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:50<00:02, 142.38it/s, est. speed input: 9160.26 toks/s, output: 5221.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:50<00:01, 152.49it/s, est. speed input: 9325.15 toks/s, output: 5354.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:01, 129.03it/s, est. speed input: 9461.56 toks/s, output: 5484.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:51<00:02, 94.75it/s, est. speed input: 9542.26 toks/s, output: 5582.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:51<00:02, 99.58it/s, est. speed input: 9702.23 toks/s, output: 5747.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:51<00:01, 105.04it/s, est. speed input: 9819.86 toks/s, output: 5842.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:01, 111.05it/s, est. speed input: 9968.51 toks/s, output: 5976.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:51<00:01, 115.67it/s, est. speed input: 10082.76 toks/s, output: 6105.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:51<00:01, 128.38it/s, est. speed input: 10236.50 toks/s, output: 6268.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:52<00:01, 92.03it/s, est. speed input: 10315.78 toks/s, output: 6349.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:52<00:01, 89.19it/s, est. speed input: 10412.45 toks/s, output: 6445.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:01, 65.93it/s, est. speed input: 10464.68 toks/s, output: 6520.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:01, 69.00it/s, est. speed input: 10573.06 toks/s, output: 6636.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:52<00:01, 69.84it/s, est. speed input: 10629.49 toks/s, output: 6710.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:01, 57.80it/s, est. speed input: 10660.06 toks/s, output: 6763.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:53<00:00, 57.17it/s, est. speed input: 10713.41 toks/s, output: 6833.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:53<00:00, 59.15it/s, est. speed input: 10766.32 toks/s, output: 6900.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:53<00:00, 62.82it/s, est. speed input: 10829.04 toks/s, output: 6973.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 43.34it/s, est. speed input: 10839.71 toks/s, output: 7016.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 23.34it/s, est. speed input: 10734.65 toks/s, output: 6990.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 17.52it/s, est. speed input: 10650.82 toks/s, output: 6970.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:07<00:00, 1.99it/s, est. speed input: 8836.09 toks/s, output: 5820.22 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:07<00:00, 19.00it/s, est. speed input: 8836.09 toks/s, output: 5820.22 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.27933862805366516, 'actor/pg_clipfrac': 0.0018814675277099013, 'actor/ppo_kl': -0.0008568750345148146}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.10947705805301666, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005006292485632002}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.11165685206651688, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000653884606435895}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.5757166147232056, 'actor/pg_clipfrac': 0.0010449320543557405, 'actor/ppo_kl': -0.0002760114730335772}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.16029709577560425, 'actor/pg_clipfrac': 0.002664298517629504, 'actor/ppo_kl': -0.0012555758003145456}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.22834235429763794, 'actor/pg_clipfrac': 0.002606429159641266, 'actor/ppo_kl': -0.0011186674237251282}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.2765398919582367, 'actor/pg_clipfrac': 0.0009319664677605033, 'actor/ppo_kl': -0.0002574085083324462}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3924393951892853, 'actor/pg_clipfrac': 0.0033738191705197096, 'actor/ppo_kl': -0.0008773456211201847}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00026559553225524724, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00023756560403853655}
[36m(Runner pid=3309020)[0m Step 52
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.276
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.026
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.005
[36m(Runner pid=3309020)[0m ppo_kl: -6.33817748237675e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.007
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.007
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 987948
[36m(Runner pid=3309020)[0m balanced_min: 987947
[36m(Runner pid=3309020)[0m max: 992483
[36m(Runner pid=3309020)[0m mean: 987947.5
[36m(Runner pid=3309020)[0m min: 983412
[36m(Runner pid=3309020)[0m minmax_diff: 9071
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.892
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1153.428
[36m(Runner pid=3309020)[0m time_per_step: 856.531
[36m(Runner pid=3309020)[0m total_num_tokens: 1975895
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 572.0
[36m(Runner pid=3309020)[0m mean: 463.529
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1283.0
[36m(Runner pid=3309020)[0m mean: 308.305
[36m(Runner pid=3309020)[0m min: 57.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.352
[36m(Runner pid=3309020)[0m format: 0.999
[36m(Runner pid=3309020)[0m overall: 0.676
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.133
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.286
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.222
[36m(Runner pid=3309020)[0m gen: 104.911
[36m(Runner pid=3309020)[0m old: 90.023
[36m(Runner pid=3309020)[0m ref: 89.787
[36m(Runner pid=3309020)[0m reward: 6.422
[36m(Runner pid=3309020)[0m step: 856.531
[36m(Runner pid=3309020)[0m update_actor: 564.466
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 53; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:28:29 [executor_base.py:219] It took 0.378225 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:30:10 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:28:29 [executor_base.py:219] It took 0.330454 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:30:11 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:30:11 [executor_base.py:208] It took 0.329022 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:30:33 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:30:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:30:34 [executor_base.py:208] It took 0.327566 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5081305503845215, 'actor/pg_clipfrac': 0.004201680887490511, 'actor/ppo_kl': 5.266045263851993e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00043169656419195235, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3099932372570038, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005118543049320579}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005851043970324099, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002915624645538628, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4616340398788452, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0027744511608034372}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00039393699262291193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.049332473427057266, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.13194972276687622, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.44702091813087463, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003023532044608146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015226565301418304}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0004315243277233094, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001100077759474516}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.306730717420578, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.24191312491893768, 'actor/pg_clipfrac': 0.0008417508215643466, 'actor/ppo_kl': 0.001624450902454555}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04797346144914627, 'actor/pg_clipfrac': 0.0005461496184580028, 'actor/ppo_kl': 0.00046230130828917027}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.003135447623208165, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.36278077960014343, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001239291625097394}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2545264959335327, 'actor/pg_clipfrac': 0.0020120723638683558, 'actor/ppo_kl': -0.000982501427643001}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0851082056760788, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00039162361645139754}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4780920743942261, 'actor/pg_clipfrac': 0.0020920501556247473, 'actor/ppo_kl': -0.0003990947443526238}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7126699090003967, 'actor/pg_clipfrac': 0.0031380753498524427, 'actor/ppo_kl': 0.0012465940089896321}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6997683644294739, 'actor/pg_clipfrac': 0.0014347202377393842, 'actor/ppo_kl': -0.0001382184709655121}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5184470415115356, 'actor/pg_clipfrac': 0.003355704713612795, 'actor/ppo_kl': 0.0005723944632336497}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.20783619582653046, 'actor/pg_clipfrac': 0.0024003840517252684, 'actor/ppo_kl': 0.0003935685381293297}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5421916842460632, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.659881647559814e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00029525041463784873, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00017872723401524127}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00025837321300059557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012842139694839716}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.47073498368263245, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015242717927321792}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00032568155438639224, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006351626943796873}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0005660116439685225, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000637065910268575}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.32159292697906494, 'actor/pg_clipfrac': 0.0015396458329632878, 'actor/ppo_kl': 0.0012853400548920035}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00027743136161006987, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007246489985845983}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0003033750399481505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001374203129671514}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002197747671743855, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002976087562274188}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002682617341633886, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014689400559291244}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5267817974090576, 'actor/pg_clipfrac': 0.0010729613713920116, 'actor/ppo_kl': 2.3931905161589384e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2332243025302887, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003456060658209026}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.5038843750953674, 'actor/pg_clipfrac': 0.0037064491771161556, 'actor/ppo_kl': -0.0017413739114999771}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.20881950855255127, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008418374345637858}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.02936173975467682, 'actor/pg_clipfrac': 0.0012812300119549036, 'actor/ppo_kl': -0.0002669493842404336}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.5424244403839111, 'actor/pg_clipfrac': 0.0028922632336616516, 'actor/ppo_kl': -0.0001233915245393291}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.000254357437370345, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013462294591590762}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00021897585247643292, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000536188657861203}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.7878336906433105, 'actor/pg_clipfrac': 0.0022446690127253532, 'actor/ppo_kl': 0.0010265560122206807}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.46230238676071167, 'actor/pg_clipfrac': 0.0030731407459825277, 'actor/ppo_kl': 0.0006280053639784455}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0005733471480198205, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001245455932803452}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0003296258219052106, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.004303014371544123}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:29<2:06:45, 5.97s/it, est. speed input: 73.76 toks/s, output: 22.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:31<57:21, 2.71s/it, est. speed input: 139.93 toks/s, output: 42.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:34<36:45, 1.74s/it, est. speed input: 194.16 toks/s, output: 59.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:35<23:14, 1.11s/it, est. speed input: 253.10 toks/s, output: 83.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:35<15:03, 1.39it/s, est. speed input: 311.96 toks/s, output: 105.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:36<10:15, 2.03it/s, est. speed input: 371.23 toks/s, output: 125.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:36<07:39, 2.71it/s, est. speed input: 426.26 toks/s, output: 142.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:36<05:36, 3.68it/s, est. speed input: 485.58 toks/s, output: 165.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:37<04:38, 4.43it/s, est. speed input: 536.14 toks/s, output: 185.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:37<03:29, 5.87it/s, est. speed input: 592.80 toks/s, output: 203.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:38<02:21, 8.64it/s, est. speed input: 700.65 toks/s, output: 246.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:38<02:08, 9.45it/s, est. speed input: 753.95 toks/s, output: 270.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:38<01:07, 17.78it/s, est. speed input: 930.59 toks/s, output: 337.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:39<00:45, 25.77it/s, est. speed input: 1094.71 toks/s, output: 409.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:39<00:40, 29.37it/s, est. speed input: 1200.09 toks/s, output: 453.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<00:32, 35.25it/s, est. speed input: 1426.28 toks/s, output: 547.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:40<00:34, 33.35it/s, est. speed input: 1531.11 toks/s, output: 588.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:40<00:29, 38.72it/s, est. speed input: 1636.86 toks/s, output: 631.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:40<00:27, 40.84it/s, est. speed input: 1738.14 toks/s, output: 682.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:40<00:20, 53.01it/s, est. speed input: 1897.97 toks/s, output: 753.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:41<00:28, 38.19it/s, est. speed input: 1984.61 toks/s, output: 797.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:41<00:26, 40.78it/s, est. speed input: 2091.15 toks/s, output: 846.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:32, 32.91it/s, est. speed input: 2184.50 toks/s, output: 881.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:42<00:52, 20.34it/s, est. speed input: 2203.26 toks/s, output: 891.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:42<00:41, 25.63it/s, est. speed input: 2297.49 toks/s, output: 940.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:42<00:34, 30.98it/s, est. speed input: 2394.80 toks/s, output: 997.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:27, 38.59it/s, est. speed input: 2497.29 toks/s, output: 1043.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:43<00:33, 30.79it/s, est. speed input: 2620.90 toks/s, output: 1103.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:43<00:28, 35.33it/s, est. speed input: 2717.76 toks/s, output: 1150.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:44<00:37, 26.92it/s, est. speed input: 2785.75 toks/s, output: 1195.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:44<00:36, 27.68it/s, est. speed input: 2828.99 toks/s, output: 1215.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:44<00:33, 30.16it/s, est. speed input: 2876.62 toks/s, output: 1238.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:44<00:32, 30.63it/s, est. speed input: 2920.53 toks/s, output: 1258.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:44<00:27, 35.86it/s, est. speed input: 3008.52 toks/s, output: 1302.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:45<00:27, 35.25it/s, est. speed input: 3048.19 toks/s, output: 1326.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:45<00:22, 42.89it/s, est. speed input: 3138.70 toks/s, output: 1369.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:45<00:23, 40.28it/s, est. speed input: 3181.22 toks/s, output: 1389.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:45<00:18, 51.92it/s, est. speed input: 3316.73 toks/s, output: 1454.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:45<00:20, 44.80it/s, est. speed input: 3397.82 toks/s, output: 1496.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:46<00:21, 44.00it/s, est. speed input: 3486.85 toks/s, output: 1535.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:46<00:21, 42.68it/s, est. speed input: 3528.86 toks/s, output: 1561.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:46<00:19, 46.71it/s, est. speed input: 3617.75 toks/s, output: 1611.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:46<00:18, 49.83it/s, est. speed input: 3701.61 toks/s, output: 1670.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:46<00:18, 48.78it/s, est. speed input: 3787.07 toks/s, output: 1699.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:47<00:18, 47.43it/s, est. speed input: 3907.98 toks/s, output: 1772.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:47<00:14, 57.74it/s, est. speed input: 4046.17 toks/s, output: 1856.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:47<00:11, 74.29it/s, est. speed input: 4222.32 toks/s, output: 1957.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:47<00:07, 111.20it/s, est. speed input: 4509.13 toks/s, output: 2134.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:47<00:06, 124.62it/s, est. speed input: 4736.24 toks/s, output: 2232.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:47<00:07, 99.26it/s, est. speed input: 4855.02 toks/s, output: 2296.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:48<00:10, 75.74it/s, est. speed input: 4961.98 toks/s, output: 2369.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:48<00:09, 75.66it/s, est. speed input: 5042.16 toks/s, output: 2420.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:48<00:09, 76.21it/s, est. speed input: 5177.02 toks/s, output: 2500.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:48<00:09, 76.70it/s, est. speed input: 5257.68 toks/s, output: 2550.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:48<00:08, 86.59it/s, est. speed input: 5387.55 toks/s, output: 2642.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:48<00:06, 105.18it/s, est. speed input: 5559.62 toks/s, output: 2738.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:49<00:11, 57.02it/s, est. speed input: 5640.79 toks/s, output: 2813.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:49<00:10, 64.83it/s, est. speed input: 5801.40 toks/s, output: 2931.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:49<00:09, 67.44it/s, est. speed input: 5873.49 toks/s, output: 2996.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:50<00:07, 79.67it/s, est. speed input: 6044.42 toks/s, output: 3122.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:50<00:06, 92.95it/s, est. speed input: 6206.10 toks/s, output: 3248.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:50<00:04, 117.60it/s, est. speed input: 6419.79 toks/s, output: 3389.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:50<00:05, 106.56it/s, est. speed input: 6529.36 toks/s, output: 3473.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:50<00:04, 122.36it/s, est. speed input: 6703.98 toks/s, output: 3579.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:50<00:04, 120.88it/s, est. speed input: 6830.14 toks/s, output: 3667.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:50<00:04, 110.95it/s, est. speed input: 6953.02 toks/s, output: 3770.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:51<00:03, 139.48it/s, est. speed input: 7172.95 toks/s, output: 3901.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:51<00:03, 133.20it/s, est. speed input: 7331.14 toks/s, output: 4026.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:51<00:03, 125.50it/s, est. speed input: 7444.31 toks/s, output: 4126.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:51<00:03, 125.60it/s, est. speed input: 7567.68 toks/s, output: 4223.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:51<00:02, 153.30it/s, est. speed input: 7813.07 toks/s, output: 4419.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:51<00:02, 180.54it/s, est. speed input: 8077.60 toks/s, output: 4617.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:51<00:02, 137.09it/s, est. speed input: 8217.55 toks/s, output: 4730.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:52<00:02, 119.83it/s, est. speed input: 8361.44 toks/s, output: 4823.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:52<00:02, 120.45it/s, est. speed input: 8480.69 toks/s, output: 4939.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:52<00:02, 115.58it/s, est. speed input: 8583.99 toks/s, output: 5036.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:52<00:02, 119.95it/s, est. speed input: 8739.40 toks/s, output: 5177.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:52<00:02, 116.89it/s, est. speed input: 8847.38 toks/s, output: 5262.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:52<00:02, 112.97it/s, est. speed input: 8989.65 toks/s, output: 5387.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:53<00:02, 90.35it/s, est. speed input: 9073.38 toks/s, output: 5462.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:53<00:02, 86.25it/s, est. speed input: 9138.16 toks/s, output: 5531.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:53<00:02, 87.06it/s, est. speed input: 9216.58 toks/s, output: 5602.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:53<00:02, 86.27it/s, est. speed input: 9285.03 toks/s, output: 5684.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:53<00:02, 83.58it/s, est. speed input: 9353.07 toks/s, output: 5752.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:53<00:02, 67.68it/s, est. speed input: 9402.79 toks/s, output: 5789.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:54<00:02, 84.38it/s, est. speed input: 9541.74 toks/s, output: 5922.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:54<00:01, 109.81it/s, est. speed input: 9799.11 toks/s, output: 6167.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:54<00:01, 102.30it/s, est. speed input: 9892.53 toks/s, output: 6271.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:54<00:00, 108.05it/s, est. speed input: 10000.06 toks/s, output: 6380.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:01, 81.33it/s, est. speed input: 10070.28 toks/s, output: 6479.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:55<00:00, 82.71it/s, est. speed input: 10162.23 toks/s, output: 6568.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:55<00:00, 81.23it/s, est. speed input: 10256.02 toks/s, output: 6669.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:55<00:00, 58.96it/s, est. speed input: 10282.35 toks/s, output: 6731.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:00, 71.74it/s, est. speed input: 10387.65 toks/s, output: 6857.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:55<00:00, 71.01it/s, est. speed input: 10444.98 toks/s, output: 6933.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:00, 44.46it/s, est. speed input: 10439.68 toks/s, output: 6968.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:57<00:00, 28.03it/s, est. speed input: 10381.39 toks/s, output: 6961.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 15.47it/s, est. speed input: 10222.83 toks/s, output: 6887.09 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 22.00it/s, est. speed input: 10222.83 toks/s, output: 6887.09 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.11656638979911804, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.5348707165685482e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.08423472195863724, 'actor/pg_clipfrac': 0.00131319765932858, 'actor/ppo_kl': 0.0003190970455761999}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2614913284778595, 'actor/pg_clipfrac': 0.0009208103292621672, 'actor/ppo_kl': -1.7991602362599224e-05}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.13846005499362946, 'actor/pg_clipfrac': 0.0005938242538832128, 'actor/ppo_kl': -0.00022082770010456443}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2692703306674957, 'actor/pg_clipfrac': 0.0005892752087675035, 'actor/ppo_kl': 0.0011036130599677563}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.12730816006660461, 'actor/pg_clipfrac': 0.0009033423848450184, 'actor/ppo_kl': 0.0005443352274596691}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.15375623106956482, 'actor/pg_clipfrac': 0.0009451796067878604, 'actor/ppo_kl': -0.0004551261954475194}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.08729665726423264, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015663506928831339}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.5652757883071899, 'actor/pg_clipfrac': 0.0012755101779475808, 'actor/ppo_kl': -0.00039807386929169297}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3377363681793213, 'actor/pg_clipfrac': 0.004085802007466555, 'actor/ppo_kl': -0.002164378995075822}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.01624009944498539, 'actor/pg_clipfrac': 0.0007892659632489085, 'actor/ppo_kl': 0.000143112862133421}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00034544954542070627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013021151535212994}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.7045152187347412, 'actor/pg_clipfrac': 0.001287001301534474, 'actor/ppo_kl': -8.820935181574896e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2461671233177185, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00017112864588852972}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.7487364411354065, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007145263371057808}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6278722882270813, 'actor/pg_clipfrac': 0.0020408162381500006, 'actor/ppo_kl': -0.001972910715267062}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00048415048513561487, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.76511073531583e-05}
[36m(Runner pid=3309020)[0m Step 53
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.22
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.031
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.021
[36m(Runner pid=3309020)[0m ppo_kl: 7.410084893848534e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.039
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.039
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 994334
[36m(Runner pid=3309020)[0m balanced_min: 994334
[36m(Runner pid=3309020)[0m max: 1001251
[36m(Runner pid=3309020)[0m mean: 994334.0
[36m(Runner pid=3309020)[0m min: 987417
[36m(Runner pid=3309020)[0m minmax_diff: 13834
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 107.904
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1122.755
[36m(Runner pid=3309020)[0m time_per_step: 885.619
[36m(Runner pid=3309020)[0m total_num_tokens: 1988668
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 632.0
[36m(Runner pid=3309020)[0m mean: 465.549
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 5500.0
[36m(Runner pid=3309020)[0m mean: 311.275
[36m(Runner pid=3309020)[0m min: 80.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.354
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.676
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.104945671907103e-05
[36m(Runner pid=3309020)[0m gen: 0.175
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.283
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.141
[36m(Runner pid=3309020)[0m gen: 139.552
[36m(Runner pid=3309020)[0m old: 87.822
[36m(Runner pid=3309020)[0m ref: 87.527
[36m(Runner pid=3309020)[0m reward: 6.582
[36m(Runner pid=3309020)[0m step: 885.619
[36m(Runner pid=3309020)[0m update_actor: 563.354
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 54; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:43:20 [executor_base.py:219] It took 0.338479 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:44:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:43:20 [executor_base.py:219] It took 0.340003 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:44:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:44:49 [executor_base.py:208] It took 0.325667 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:45:17 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:45:18 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:45:18 [executor_base.py:208] It took 0.327796 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.32458972930908203, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013170834863558412}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00029247364727780223, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.1481339931488037, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.38681015372276306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7079448699951172, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002863822446670383, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.18148641288280487, 'actor/pg_clipfrac': 0.001719690510071814, 'actor/ppo_kl': 0.0008208716753870249}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.7267612814903259, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00018404604634270072, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00031173494062386453, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003423818852752447, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.08448746055364609, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.11949602514505386, 'actor/pg_clipfrac': 0.0005662514013238251, 'actor/ppo_kl': 0.0008003897964954376}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23063234984874725, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00040634654578752816, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012887556804344058}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3824997842311859, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.310251921415329, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005194525583647192}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12250862270593643, 'actor/pg_clipfrac': 0.0007342143799178302, 'actor/ppo_kl': 0.000836051709484309}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.17615669965744019, 'actor/pg_clipfrac': 0.00570776266977191, 'actor/ppo_kl': 0.001241808058694005}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2074587196111679, 'actor/pg_clipfrac': 0.0005889281746931374, 'actor/ppo_kl': -0.00042295962339267135}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00027767143910750747, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013159709051251411}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.08413166552782059, 'actor/pg_clipfrac': 0.0007942811935208738, 'actor/ppo_kl': 0.0013525980757549405}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2525034546852112, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0026724848430603743}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0642884373664856, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027258734917268157}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.32314667105674744, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005519137484952807}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.25665342807769775, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000969761807937175}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00030134565895423293, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011206783819943666}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5762042999267578, 'actor/pg_clipfrac': 0.0007627765298821032, 'actor/ppo_kl': -0.0006262024980969727}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5237528085708618, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002770028659142554}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003626446414273232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00173261109739542}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00022365314362104982, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005756759201176465}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.26460859179496765, 'actor/pg_clipfrac': 0.0018814675277099013, 'actor/ppo_kl': -0.0007196858641691506}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0003896575653925538, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002600467298179865}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5848034620285034, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013780955923721194}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08299290388822556, 'actor/pg_clipfrac': 0.002871500328183174, 'actor/ppo_kl': -6.400777056114748e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00045699015026912093, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009059719741344452}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.7421176433563232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002844425616785884}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003880687290802598, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000289121235255152}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00030846334993839264, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.1178690328961238e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00037405695184133947, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.518183312844485e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.24317620694637299, 'actor/pg_clipfrac': 0.0012383901048451662, 'actor/ppo_kl': 0.0011294811265543103}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.20026619732379913, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000849034171551466}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.24614302814006805, 'actor/pg_clipfrac': 0.0021598271559923887, 'actor/ppo_kl': -0.0007170847966335714}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.08291491121053696, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002008485607802868}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11877437680959702, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005981152644380927}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.14140871167182922, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001091928337700665}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.34934496879577637, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011948593892157078}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4965924918651581, 'actor/pg_clipfrac': 0.0021321962121874094, 'actor/ppo_kl': 0.0007499275961890817}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00019957155745942146, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011839749058708549}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4909643232822418, 'actor/pg_clipfrac': 0.0032102728728204966, 'actor/ppo_kl': 0.0007649417384527624}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -1.2198106050491333, 'actor/pg_clipfrac': 0.004761904943734407, 'actor/ppo_kl': -0.0021569584496319294}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.23253504931926727, 'actor/pg_clipfrac': 0.001095290295779705, 'actor/ppo_kl': 0.000533959420863539}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.17755700647830963, 'actor/pg_clipfrac': 0.0012195121962577105, 'actor/ppo_kl': 0.0012428946793079376}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6502472758293152, 'actor/pg_clipfrac': 0.0032102728728204966, 'actor/ppo_kl': 0.0018555853748694062}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.30270931124687195, 'actor/pg_clipfrac': 0.002799160312861204, 'actor/ppo_kl': -0.0007828938541933894}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.16193899512290955, 'actor/pg_clipfrac': 0.0009372071363031864, 'actor/ppo_kl': 0.002303612418472767}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.05081449821591377, 'actor/pg_clipfrac': 0.004667444620281458, 'actor/ppo_kl': 0.0006526270299218595}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0004904762608930469, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.5657456970075145e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.4924697279930115, 'actor/pg_clipfrac': 0.001055966247804463, 'actor/ppo_kl': -0.000671388756018132}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.24650034308433533, 'actor/pg_clipfrac': 0.0009727626456879079, 'actor/ppo_kl': -0.0007021789206191897}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11228843033313751, 'actor/pg_clipfrac': 0.0011709601385518909, 'actor/ppo_kl': 0.0008059620158746839}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.6630303859710693, 'actor/pg_clipfrac': 0.002217294881120324, 'actor/ppo_kl': 0.00020431890152394772}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.1492103487253189, 'actor/pg_clipfrac': 0.0022744503803551197, 'actor/ppo_kl': 0.0005455928039737046}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0004273975791875273, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014695741701871157}
[36m(Runner pid=3309020)[0m Step 54
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.264
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.046
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.018
[36m(Runner pid=3309020)[0m ppo_kl: 8.290299395508072e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.029
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.029
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.678
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.678
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1009839
[36m(Runner pid=3309020)[0m balanced_min: 1006756
[36m(Runner pid=3309020)[0m max: 1021147
[36m(Runner pid=3309020)[0m mean: 1008297.5
[36m(Runner pid=3309020)[0m min: 995448
[36m(Runner pid=3309020)[0m minmax_diff: 25699
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.582
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.124
[36m(Runner pid=3309020)[0m throughput: 1145.234
[36m(Runner pid=3309020)[0m time_per_step: 880.429
[36m(Runner pid=3309020)[0m total_num_tokens: 2016595
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 687.0
[36m(Runner pid=3309020)[0m mean: 466.775
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 4841.0
[36m(Runner pid=3309020)[0m mean: 320.957
[36m(Runner pid=3309020)[0m min: 46.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.357
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.678
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.347764720877517e-05
[36m(Runner pid=3309020)[0m gen: 0.162
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.28
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.148
[36m(Runner pid=3309020)[0m gen: 132.79
[36m(Runner pid=3309020)[0m old: 87.928
[36m(Runner pid=3309020)[0m ref: 87.8
[36m(Runner pid=3309020)[0m reward: 6.362
[36m(Runner pid=3309020)[0m step: 880.429
[36m(Runner pid=3309020)[0m update_actor: 564.758
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 55; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:58:03 [executor_base.py:219] It took 0.341650 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:34:04, 4.43s/it, est. speed input: 100.75 toks/s, output: 27.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<41:54, 1.98s/it, est. speed input: 193.26 toks/s, output: 51.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:23<23:24, 1.11s/it, est. speed input: 288.96 toks/s, output: 74.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<22:00, 1.05s/it, est. speed input: 319.65 toks/s, output: 87.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:23, 1.45it/s, est. speed input: 394.19 toks/s, output: 113.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<10:00, 2.08it/s, est. speed input: 468.47 toks/s, output: 134.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<07:55, 2.62it/s, est. speed input: 526.15 toks/s, output: 152.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:40, 3.09it/s, est. speed input: 582.76 toks/s, output: 171.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:31<05:04, 4.05it/s, est. speed input: 648.62 toks/s, output: 193.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:31<03:59, 5.13it/s, est. speed input: 712.30 toks/s, output: 215.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:33<04:38, 4.40it/s, est. speed input: 748.45 toks/s, output: 229.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:34<04:36, 4.41it/s, est. speed input: 790.36 toks/s, output: 243.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<03:56, 5.14it/s, est. speed input: 839.85 toks/s, output: 261.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<03:25, 5.90it/s, est. speed input: 891.70 toks/s, output: 284.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<03:02, 6.62it/s, est. speed input: 940.69 toks/s, output: 305.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:36<01:45, 11.32it/s, est. speed input: 1058.94 toks/s, output: 352.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<00:49, 23.57it/s, est. speed input: 1307.36 toks/s, output: 440.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<00:45, 25.67it/s, est. speed input: 1420.26 toks/s, output: 483.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:37<00:46, 24.92it/s, est. speed input: 1471.50 toks/s, output: 503.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:50, 22.67it/s, est. speed input: 1522.98 toks/s, output: 523.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:47, 24.37it/s, est. speed input: 1575.61 toks/s, output: 541.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:47, 23.86it/s, est. speed input: 1630.63 toks/s, output: 564.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:56, 20.22it/s, est. speed input: 1678.52 toks/s, output: 586.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:49, 22.95it/s, est. speed input: 1787.79 toks/s, output: 625.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:39, 28.44it/s, est. speed input: 1898.58 toks/s, output: 675.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:39<00:35, 31.35it/s, est. speed input: 2054.48 toks/s, output: 737.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:34, 32.12it/s, est. speed input: 2107.79 toks/s, output: 761.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:30, 36.21it/s, est. speed input: 2217.31 toks/s, output: 808.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:24, 43.34it/s, est. speed input: 2326.94 toks/s, output: 859.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:39<00:28, 38.07it/s, est. speed input: 2369.74 toks/s, output: 882.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:28, 37.88it/s, est. speed input: 2420.12 toks/s, output: 901.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:44, 23.92it/s, est. speed input: 2450.84 toks/s, output: 915.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:40<00:34, 30.91it/s, est. speed input: 2555.17 toks/s, output: 968.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:37, 28.00it/s, est. speed input: 2595.18 toks/s, output: 993.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:27, 37.74it/s, est. speed input: 2698.30 toks/s, output: 1047.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:32, 32.26it/s, est. speed input: 2739.42 toks/s, output: 1069.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:30, 34.31it/s, est. speed input: 2789.39 toks/s, output: 1091.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:41<00:27, 37.41it/s, est. speed input: 2886.45 toks/s, output: 1140.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:26, 38.54it/s, est. speed input: 2932.33 toks/s, output: 1169.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:20, 49.33it/s, est. speed input: 3036.46 toks/s, output: 1223.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:33, 30.09it/s, est. speed input: 3108.76 toks/s, output: 1258.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:42<00:25, 38.73it/s, est. speed input: 3208.92 toks/s, output: 1315.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:42<00:22, 44.30it/s, est. speed input: 3307.01 toks/s, output: 1373.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:15, 61.82it/s, est. speed input: 3466.03 toks/s, output: 1442.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:13, 71.17it/s, est. speed input: 3616.31 toks/s, output: 1500.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:43<00:14, 63.48it/s, est. speed input: 3711.23 toks/s, output: 1551.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:43<00:10, 89.66it/s, est. speed input: 3965.84 toks/s, output: 1670.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:43<00:10, 84.09it/s, est. speed input: 4113.78 toks/s, output: 1752.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:10, 80.86it/s, est. speed input: 4253.02 toks/s, output: 1830.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:12, 71.21it/s, est. speed input: 4338.93 toks/s, output: 1884.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:18, 47.26it/s, est. speed input: 4403.46 toks/s, output: 1918.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:18, 45.65it/s, est. speed input: 4486.51 toks/s, output: 1969.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:44<00:19, 43.04it/s, est. speed input: 4565.12 toks/s, output: 2022.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:14, 55.76it/s, est. speed input: 4711.77 toks/s, output: 2096.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:45<00:13, 60.02it/s, est. speed input: 4798.82 toks/s, output: 2134.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:45<00:13, 59.67it/s, est. speed input: 4880.77 toks/s, output: 2178.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:14, 55.77it/s, est. speed input: 4957.45 toks/s, output: 2235.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:17, 45.06it/s, est. speed input: 5022.32 toks/s, output: 2273.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:45<00:15, 50.42it/s, est. speed input: 5151.40 toks/s, output: 2353.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:46<00:13, 56.29it/s, est. speed input: 5238.98 toks/s, output: 2413.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:46<00:12, 61.76it/s, est. speed input: 5322.40 toks/s, output: 2451.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:46<00:09, 75.99it/s, est. speed input: 5454.10 toks/s, output: 2545.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:09, 77.94it/s, est. speed input: 5541.97 toks/s, output: 2601.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:46<00:06, 104.24it/s, est. speed input: 5718.42 toks/s, output: 2695.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:46<00:06, 105.29it/s, est. speed input: 5855.57 toks/s, output: 2763.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:46<00:07, 92.52it/s, est. speed input: 5982.32 toks/s, output: 2855.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:06, 97.63it/s, est. speed input: 6113.32 toks/s, output: 2943.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:47<00:07, 85.18it/s, est. speed input: 6225.58 toks/s, output: 3012.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:47<00:09, 67.32it/s, est. speed input: 6286.23 toks/s, output: 3042.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:08, 74.66it/s, est. speed input: 6410.49 toks/s, output: 3120.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:08, 68.31it/s, est. speed input: 6481.30 toks/s, output: 3166.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:48<00:09, 64.14it/s, est. speed input: 6554.41 toks/s, output: 3218.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:48<00:08, 68.40it/s, est. speed input: 6630.48 toks/s, output: 3279.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:48<00:08, 72.17it/s, est. speed input: 6714.58 toks/s, output: 3343.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:48<00:08, 71.02it/s, est. speed input: 6788.84 toks/s, output: 3385.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:48<00:05, 96.17it/s, est. speed input: 6965.08 toks/s, output: 3496.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:48<00:05, 93.69it/s, est. speed input: 7088.36 toks/s, output: 3592.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:48<00:06, 85.81it/s, est. speed input: 7155.57 toks/s, output: 3652.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:48<00:06, 84.74it/s, est. speed input: 7234.77 toks/s, output: 3721.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:49<00:03, 132.80it/s, est. speed input: 7542.11 toks/s, output: 3964.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:49<00:04, 93.47it/s, est. speed input: 7634.56 toks/s, output: 4047.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:05, 88.44it/s, est. speed input: 7744.37 toks/s, output: 4120.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:49<00:04, 89.71it/s, est. speed input: 7816.88 toks/s, output: 4168.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:50<00:07, 57.50it/s, est. speed input: 7849.17 toks/s, output: 4209.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:50<00:06, 62.35it/s, est. speed input: 7954.08 toks/s, output: 4301.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:50<00:05, 74.25it/s, est. speed input: 8071.35 toks/s, output: 4430.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:50<00:04, 82.40it/s, est. speed input: 8186.09 toks/s, output: 4520.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:50<00:04, 75.35it/s, est. speed input: 8247.19 toks/s, output: 4577.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:50<00:04, 73.00it/s, est. speed input: 8314.65 toks/s, output: 4647.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:51<00:04, 73.07it/s, est. speed input: 8380.20 toks/s, output: 4699.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:51<00:05, 60.95it/s, est. speed input: 8431.46 toks/s, output: 4759.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:51<00:04, 75.11it/s, est. speed input: 8552.98 toks/s, output: 4862.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:51<00:02, 115.20it/s, est. speed input: 8806.62 toks/s, output: 5077.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:51<00:02, 99.81it/s, est. speed input: 8908.01 toks/s, output: 5147.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:52<00:03, 70.44it/s, est. speed input: 8975.85 toks/s, output: 5209.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:52<00:02, 86.47it/s, est. speed input: 9145.99 toks/s, output: 5364.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:52<00:02, 89.48it/s, est. speed input: 9249.11 toks/s, output: 5464.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:52<00:01, 117.08it/s, est. speed input: 9444.55 toks/s, output: 5654.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:52<00:01, 113.81it/s, est. speed input: 9552.79 toks/s, output: 5762.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:52<00:01, 105.69it/s, est. speed input: 9658.75 toks/s, output: 5865.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:52<00:01, 112.84it/s, est. speed input: 9778.13 toks/s, output: 5989.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:53<00:01, 90.00it/s, est. speed input: 9866.48 toks/s, output: 6087.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:53<00:01, 88.78it/s, est. speed input: 9960.64 toks/s, output: 6167.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:53<00:01, 87.55it/s, est. speed input: 10020.21 toks/s, output: 6258.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:53<00:01, 82.21it/s, est. speed input: 10107.04 toks/s, output: 6371.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:54<00:01, 57.29it/s, est. speed input: 10124.30 toks/s, output: 6418.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:01, 61.19it/s, est. speed input: 10185.56 toks/s, output: 6498.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:54<00:01, 61.50it/s, est. speed input: 10299.59 toks/s, output: 6633.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:54<00:00, 66.73it/s, est. speed input: 10390.32 toks/s, output: 6779.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:55<00:01, 34.41it/s, est. speed input: 10331.64 toks/s, output: 6789.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:55<00:00, 38.60it/s, est. speed input: 10403.10 toks/s, output: 6917.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 38.44it/s, est. speed input: 10436.37 toks/s, output: 6999.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:00, 27.88it/s, est. speed input: 10392.10 toks/s, output: 6997.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 27.31it/s, est. speed input: 10399.32 toks/s, output: 7050.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.49it/s, est. speed input: 10427.93 toks/s, output: 7096.95 toks/s]
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:59:31 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:58:03 [executor_base.py:219] It took 0.341899 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:59:31 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 10:59:31 [executor_base.py:208] It took 0.324995 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:59:40 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:59:40 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 10:59:40 [executor_base.py:208] It took 0.329267 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0004992065951228142, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015694815665483475}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003103076887782663, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003459038562141359, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012158508179709315}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005613437388092279, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015440239803865552}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3926284611225128, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00023773641441948712, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001367518270853907}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3540615439414978, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.15979820489883423, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011348569532856345}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.3344387412071228, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 8.58421772136353e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00047748637734912336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003175186284352094, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006104045314714313}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.628720760345459, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00014804753300268203, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001730578369461}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00040849403012543917, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.3684746026992798, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005287639214657247}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4151090979576111, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2681373655796051, 'actor/pg_clipfrac': 0.0030456853564828634, 'actor/ppo_kl': -0.00030245320522226393}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.11623553186655045, 'actor/pg_clipfrac': 0.001612903201021254, 'actor/ppo_kl': -0.0002563015150371939}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00033907665056176484, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.003228608751669526}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.5560184717178345, 'actor/pg_clipfrac': 0.0037629350554198027, 'actor/ppo_kl': -0.0005772555596195161}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.07822173088788986, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00014042302791494876}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3412061631679535, 'actor/pg_clipfrac': 0.0020120723638683558, 'actor/ppo_kl': 0.00082549819489941}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.12768064439296722, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007715461542829871}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00038328106165863574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010901144705712795}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0003818502591457218, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00046137539902701974}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.05757361650466919, 'actor/pg_clipfrac': 0.0006635700119659305, 'actor/ppo_kl': 0.0004880471096839756}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.24009168148040771, 'actor/pg_clipfrac': 0.0014936520019546151, 'actor/ppo_kl': -8.861692913342267e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.0938817709684372, 'actor/pg_clipfrac': 0.001004016026854515, 'actor/ppo_kl': 0.0020107533782720566}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002742055803537369, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000578027858864516}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3507310748100281, 'actor/pg_clipfrac': 0.0025723471771925688, 'actor/ppo_kl': -0.0002754579472821206}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.30364125967025757, 'actor/pg_clipfrac': 0.0010432967683300376, 'actor/ppo_kl': -0.0004599963722284883}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00018805601575877517, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.281723886379041e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00033932956284843385, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001615106244571507}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00029849170823581517, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004727540072053671}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.45763692259788513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008139627170749009}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.29830026626586914, 'actor/pg_clipfrac': 0.003086419776082039, 'actor/ppo_kl': 0.000319960672641173}[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:36:07, 15.34s/it, est. speed input: 30.51 toks/s, output: 5.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<40:26, 6.47s/it, est. speed input: 58.58 toks/s, output: 11.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 4/377 [00:15<15:46, 2.54s/it, est. speed input: 115.32 toks/s, output: 23.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:16<08:25, 1.36s/it, est. speed input: 170.94 toks/s, output: 37.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 9/377 [00:16<04:19, 1.42it/s, est. speed input: 255.84 toks/s, output: 57.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 12/377 [00:16<02:38, 2.31it/s, est. speed input: 337.09 toks/s, output: 78.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 14/377 [00:16<02:00, 3.00it/s, est. speed input: 390.12 toks/s, output: 92.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 18/377 [00:16<01:12, 4.95it/s, est. speed input: 498.25 toks/s, output: 121.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 21/377 [00:16<00:54, 6.49it/s, est. speed input: 576.21 toks/s, output: 143.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:17<00:34, 10.19it/s, est. speed input: 704.00 toks/s, output: 181.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 35/377 [00:17<00:19, 17.98it/s, est. speed input: 937.57 toks/s, output: 252.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 40/377 [00:17<00:15, 21.46it/s, est. speed input: 1064.50 toks/s, output: 292.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 44/377 [00:17<00:14, 22.33it/s, est. speed input: 1159.24 toks/s, output: 324.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 49/377 [00:17<00:13, 24.64it/s, est. speed input: 1281.69 toks/s, output: 363.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 54/377 [00:17<00:11, 27.96it/s, est. speed input: 1414.34 toks/s, output: 406.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 59/377 [00:17<00:10, 30.91it/s, est. speed input: 1533.50 toks/s, output: 450.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 65/377 [00:18<00:08, 35.45it/s, est. speed input: 1678.16 toks/s, output: 503.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 72/377 [00:18<00:07, 41.28it/s, est. speed input: 1844.13 toks/s, output: 566.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 79/377 [00:18<00:06, 45.94it/s, est. speed input: 2008.02 toks/s, output: 628.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 85/377 [00:18<00:06, 44.40it/s, est. speed input: 2144.46 toks/s, output: 682.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 92/377 [00:18<00:05, 48.64it/s, est. speed input: 2305.94 toks/s, output: 747.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 98/377 [00:18<00:05, 46.54it/s, est. speed input: 2437.34 toks/s, output: 801.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 103/377 [00:18<00:06, 40.50it/s, est. speed input: 2541.05 toks/s, output: 845.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 108/377 [00:19<00:08, 33.29it/s, est. speed input: 2633.88 toks/s, output: 887.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 121/377 [00:19<00:04, 51.61it/s, est. speed input: 2934.74 toks/s, output: 1023.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 130/377 [00:19<00:04, 54.80it/s, est. speed input: 3126.37 toks/s, output: 1112.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:19<00:04, 55.91it/s, est. speed input: 3277.59 toks/s, output: 1184.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 145/377 [00:19<00:04, 56.33it/s, est. speed input: 3446.77 toks/s, output: 1266.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 151/377 [00:19<00:04, 55.27it/s, est. speed input: 3568.87 toks/s, output: 1328.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 157/377 [00:19<00:04, 51.55it/s, est. speed input: 3688.23 toks/s, output: 1389.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 166/377 [00:19<00:03, 58.85it/s, est. speed input: 3878.36 toks/s, output: 1488.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 173/377 [00:20<00:04, 50.14it/s, est. speed input: 4006.19 toks/s, output: 1558.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 179/377 [00:20<00:04, 49.13it/s, est. speed input: 4120.90 toks/s, output: 1623.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 185/377 [00:20<00:04, 46.80it/s, est. speed input: 4236.71 toks/s, output: 1688.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████▏ | 194/377 [00:20<00:03, 56.75it/s, est. speed input: 4422.03 toks/s, output: 1795.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 203/377 [00:20<00:02, 59.42it/s, est. speed input: 4601.51 toks/s, output: 1902.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 212/377 [00:20<00:02, 64.14it/s, est. speed input: 4782.32 toks/s, output: 2010.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 219/377 [00:20<00:02, 62.95it/s, est. speed input: 4912.58 toks/s, output: 2093.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 226/377 [00:21<00:02, 52.36it/s, est. speed input: 5030.13 toks/s, output: 2171.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 237/377 [00:21<00:02, 61.08it/s, est. speed input: 5243.93 toks/s, output: 2311.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:21<00:01, 65.82it/s, est. speed input: 5416.18 toks/s, output: 2429.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 254/377 [00:21<00:01, 66.53it/s, est. speed input: 5568.40 toks/s, output: 2533.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 261/377 [00:21<00:02, 53.71it/s, est. speed input: 5673.41 toks/s, output: 2613.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 270/377 [00:21<00:01, 61.14it/s, est. speed input: 5841.35 toks/s, output: 2737.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 277/377 [00:21<00:01, 51.76it/s, est. speed input: 5944.15 toks/s, output: 2824.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 283/377 [00:22<00:01, 51.09it/s, est. speed input: 6044.66 toks/s, output: 2904.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 289/377 [00:22<00:01, 46.76it/s, est. speed input: 6128.51 toks/s, output: 2981.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 296/377 [00:22<00:01, 51.47it/s, est. speed input: 6251.36 toks/s, output: 3085.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 302/377 [00:22<00:01, 48.45it/s, est. speed input: 6343.02 toks/s, output: 3168.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 311/377 [00:22<00:01, 56.31it/s, est. speed input: 6500.02 toks/s, output: 3309.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 317/377 [00:22<00:01, 52.76it/s, est. speed input: 6586.71 toks/s, output: 3396.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 324/377 [00:22<00:01, 47.64it/s, est. speed input: 6682.14 toks/s, output: 3497.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 330/377 [00:23<00:00, 47.36it/s, est. speed input: 6768.14 toks/s, output: 3591.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 337/377 [00:23<00:00, 48.68it/s, est. speed input: 6874.83 toks/s, output: 3705.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 342/377 [00:23<00:00, 45.46it/s, est. speed input: 6938.81 toks/s, output: 3782.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 347/377 [00:23<00:00, 33.98it/s, est. speed input: 6963.60 toks/s, output: 3845.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 351/377 [00:23<00:00, 32.91it/s, est. speed input: 7005.19 toks/s, output: 3907.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 355/377 [00:24<00:01, 20.03it/s, est. speed input: 6954.17 toks/s, output: 3926.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 358/377 [00:24<00:01, 11.01it/s, est. speed input: 6804.23 toks/s, output: 3881.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:26<00:03, 4.81it/s, est. speed input: 6396.69 toks/s, output: 3701.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 361/377 [00:38<00:03, 4.81it/s, est. speed input: 6396.69 toks/s, output: 3701.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:38<00:20, 1.39s/it, est. speed input: 4392.65 toks/s, output: 2591.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 363/377 [00:49<00:36, 2.58s/it, est. speed input: 3433.35 toks/s, output: 2087.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:57<00:42, 3.24s/it, est. speed input: 3017.13 toks/s, output: 1901.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [01:01<00:33, 3.01s/it, est. speed input: 2792.49 toks/s, output: 1897.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [01:05<00:31, 3.10s/it, est. speed input: 2648.69 toks/s, output: 1871.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [01:10<00:30, 3.40s/it, est. speed input: 2482.21 toks/s, output: 1827.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 375/377 [01:10<00:02, 1.14s/it, est. speed input: 2526.62 toks/s, output: 2373.33 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:10<00:00, 5.37it/s, est. speed input: 2539.12 toks/s, output: 2529.19 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.1815774291753769, 'actor/pg_clipfrac': 0.0011947430903092027, 'actor/ppo_kl': 0.001295634312555194}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0002526159805711359, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001874847657745704}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.057589393109083176, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 8.37730331113562e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.10895412415266037, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002458130766171962}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.09638116508722305, 'actor/pg_clipfrac': 0.0019267823081463575, 'actor/ppo_kl': 0.0017345185624435544}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.12382439523935318, 'actor/pg_clipfrac': 0.0017899761442095041, 'actor/ppo_kl': -7.254753290908411e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0005974539089947939, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006949898670427501}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2047264724969864, 'actor/pg_clipfrac': 0.0016420361353084445, 'actor/ppo_kl': 0.000301848107483238}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.5056794285774231, 'actor/pg_clipfrac': 0.0009354536887258291, 'actor/ppo_kl': -0.0017937481170520186}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 1.0609652996063232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00026291224639862776}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00042902439599856734, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003390243509784341}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00034995016176253557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001997367711737752}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002398013893980533, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00029175778036005795}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.09075652807950974, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018179792677983642}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.17150962352752686, 'actor/pg_clipfrac': 0.0011154490057379007, 'actor/ppo_kl': -2.6507146685617045e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.05623667314648628, 'actor/pg_clipfrac': 0.00046168052358552814, 'actor/ppo_kl': 0.0013768274802714586}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001944806717801839, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002728680265136063}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.4082217812538147, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010981736704707146}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1808289736509323, 'actor/pg_clipfrac': 0.0026785715017467737, 'actor/ppo_kl': -0.00034777267137542367}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.22297516465187073, 'actor/pg_clipfrac': 0.0008156606927514076, 'actor/ppo_kl': 4.816288492293097e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.44825682044029236, 'actor/pg_clipfrac': 0.0031796502880752087, 'actor/ppo_kl': -0.0005879925447516143}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.22841857373714447, 'actor/pg_clipfrac': 0.0005115089588798583, 'actor/ppo_kl': 0.000532465404830873}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.10025478899478912, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009630391141399741}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.40715107321739197, 'actor/pg_clipfrac': 0.0021739129442721605, 'actor/ppo_kl': 0.00017095441580750048}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.46594005823135376, 'actor/pg_clipfrac': 0.0006816632812842727, 'actor/ppo_kl': 0.0007636207155883312}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0601884126663208, 'actor/pg_clipfrac': 0.0008467400330118835, 'actor/ppo_kl': 0.00016899658658076078}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00026714676641859114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001669586286880076}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0001648842153372243, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011674542911350727}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:12:57 [executor_base.py:219] It took 0.338799 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:14:49 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:12:57 [executor_base.py:219] It took 0.342884 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:14:49 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:14:49 [executor_base.py:208] It took 0.327870 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:14:50 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:14:51 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:14:51 [executor_base.py:208] It took 0.325869 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the geometric properties and the given conditions. Since point D is the midpoint of AB and BD = 2.0, it implies that AD = 2.0 as well. The line l is a vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC.\n\nSince E is on the vertical line l and D is the midpoint of AB, the shortest distance from A to E is the length of AD, which is 2.0. The length of AC is constant and does not change. The length of EC is minimized when E is at the point where the line from A perpendicular to l intersects l, which is the same as the distance from A to the line l, which is the same as the distance from A to D since D is the midpoint.\n\nThus, the minimum perimeter of triangle AEC is the sum of the minimum lengths of AE and EC, which is 2.0 + 2.0 = 4.0. However, since AC is a constant and does not change, the minimum perimeter of triangle AEC is simply the minimum of AE + EC, which is 2.0 + 2.0 = 4.0.\n\nTherefore, the minimum perimeter of triangle AEC is 4.0.\n\n\n4.0\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90.0 degrees to obtain triangle AB'C'. This means that angle BAC = 90.0 degrees, and since the rotation is 90.0 degrees, angle B'AC' = 90.0 degrees as well. \n\nGiven that angle CC'B' = 32.0 degrees, we can deduce that angle AC'C' = 90.0 - 32.0 = 58.0 degrees because the angle sum in a triangle is 180 degrees.\n\nSince angle AC'B' is part of triangle AB'C', and we know angle AC'C' = 58.0 degrees, we can find angle AC'B' by subtracting angle AC'C' from 90.0 degrees (since angle BAC' = 90.0 degrees).\n\nTherefore, angle AC'B' = 90.0 - 58.0 = 32.0 degrees.\n\nSo, the size of angle AC'B' is 32.0 degrees.\n\n\n32.0\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are on circle O, and C is on the diameter BE, we know that angle ADC is given as 54.0 degrees. Since C is on the diameter BE, angle BAC is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Now, we need to find angle AEB. Since angle ADC is 54.0 degrees, angle BAC can be found by subtracting angle ADC from 90 degrees, giving us angle BAC = 90 - 54 = 36 degrees. In triangle ABE, angle AEB is an angle of the triangle, and since the sum of angles in a triangle is 180 degrees, we can find angle AEB by subtracting the sum of the other two angles (angle BAC and angle AEB) from 180 degrees. Since angle BAC = 36 degrees and angle AEB is the angle we are looking for, we can write the equation as follows:\n\\[ \\text{angle AEB} = 180 - \\text{angle BAC} - \\text{angle AEB} \\]\nSince angle BAC = 36 degrees, we substitute and solve for angle AEB:\n\\[ \\text{angle AEB} = 180 - 36 = 144 \\]\nHowever, we need to correct this as angle BAC is actually 36 degrees, not 90 - 54 = 36 degrees. So, we should directly use the fact that angle BAC is 36 degrees and angle AEB is the angle we are looking for:\n\\[ \\text{angle AEB} = 180 - 90 - 36 = 54 \\]\nTherefore, the degree of angle AEB is 54 degrees.\n\n\n54\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve for angle C, we can use the properties of circles and tangents. Since AB is the diameter of circle O, angle ADB is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. Given that angle A is 35 degrees, we can find angle D by subtracting angle A from 90 degrees. Then, we can use the fact that the tangent line CD is perpendicular to the radius OD at the point of tangency D, which means angle ODC is also 90 degrees. Therefore, angle C can be found by subtracting angle D from 90 degrees.\n\n\nTo find angle C, we first calculate angle D:\nangle D = 90 degrees - angle A = 90 degrees - 35 degrees = 55 degrees.\nSince angle ODC is 90 degrees and angle D is 55 degrees, angle C can be found as:\nangle C = 90 degrees - angle D = 90 degrees - 55 degrees = 35 degrees.\nTherefore, angle C is 35 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:35:02, 4.47s/it, est. speed input: 101.06 toks/s, output: 27.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<46:44, 2.21s/it, est. speed input: 177.01 toks/s, output: 52.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<31:05, 1.47s/it, est. speed input: 239.38 toks/s, output: 74.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<21:03, 1.00s/it, est. speed input: 303.02 toks/s, output: 97.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<13:39, 1.53it/s, est. speed input: 373.03 toks/s, output: 122.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<10:47, 1.93it/s, est. speed input: 430.51 toks/s, output: 138.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<08:23, 2.47it/s, est. speed input: 486.90 toks/s, output: 160.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<05:55, 3.49it/s, est. speed input: 555.56 toks/s, output: 182.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<04:50, 4.25it/s, est. speed input: 616.59 toks/s, output: 204.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<03:33, 5.75it/s, est. speed input: 729.73 toks/s, output: 245.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<02:29, 8.15it/s, est. speed input: 853.83 toks/s, output: 288.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:07, 9.47it/s, est. speed input: 917.43 toks/s, output: 313.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<01:58, 10.16it/s, est. speed input: 974.19 toks/s, output: 334.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<01:41, 11.79it/s, est. speed input: 1028.77 toks/s, output: 356.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:09, 17.10it/s, est. speed input: 1153.60 toks/s, output: 405.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:35<01:01, 19.30it/s, est. speed input: 1212.53 toks/s, output: 427.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:35<00:43, 26.93it/s, est. speed input: 1333.68 toks/s, output: 476.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:35<00:41, 28.13it/s, est. speed input: 1395.35 toks/s, output: 497.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:36<00:47, 24.28it/s, est. speed input: 1447.93 toks/s, output: 513.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:36<00:51, 22.31it/s, est. speed input: 1552.82 toks/s, output: 552.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:58, 19.69it/s, est. speed input: 1600.11 toks/s, output: 569.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:37<00:51, 22.07it/s, est. speed input: 1655.30 toks/s, output: 589.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:37<00:36, 30.86it/s, est. speed input: 1772.34 toks/s, output: 639.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:37<00:40, 27.92it/s, est. speed input: 1875.97 toks/s, output: 685.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:37<00:31, 35.53it/s, est. speed input: 1986.14 toks/s, output: 727.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:38<00:30, 35.81it/s, est. speed input: 2042.32 toks/s, output: 753.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:21, 52.06it/s, est. speed input: 2212.93 toks/s, output: 827.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:38<00:21, 51.52it/s, est. speed input: 2321.00 toks/s, output: 868.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:38<00:27, 39.37it/s, est. speed input: 2414.85 toks/s, output: 907.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:38<00:27, 39.35it/s, est. speed input: 2464.31 toks/s, output: 928.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:22, 47.80it/s, est. speed input: 2578.72 toks/s, output: 984.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:26, 38.96it/s, est. speed input: 2675.40 toks/s, output: 1036.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:39<00:22, 47.12it/s, est. speed input: 2787.71 toks/s, output: 1090.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:31, 32.44it/s, est. speed input: 2864.54 toks/s, output: 1133.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:40<00:25, 40.28it/s, est. speed input: 2967.77 toks/s, output: 1179.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:40<00:13, 71.98it/s, est. speed input: 3300.00 toks/s, output: 1340.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:40<00:20, 48.98it/s, est. speed input: 3379.20 toks/s, output: 1387.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:41<00:22, 43.40it/s, est. speed input: 3463.33 toks/s, output: 1428.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:41<00:21, 44.76it/s, est. speed input: 3605.86 toks/s, output: 1500.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:41<00:21, 43.10it/s, est. speed input: 3688.50 toks/s, output: 1550.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:41<00:14, 63.00it/s, est. speed input: 3901.74 toks/s, output: 1657.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:42<00:14, 61.76it/s, est. speed input: 4044.74 toks/s, output: 1727.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:42<00:12, 69.82it/s, est. speed input: 4192.39 toks/s, output: 1808.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:42<00:13, 64.36it/s, est. speed input: 4279.69 toks/s, output: 1855.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:42<00:16, 52.94it/s, est. speed input: 4363.95 toks/s, output: 1894.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:42<00:17, 49.41it/s, est. speed input: 4444.09 toks/s, output: 1934.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:43<00:17, 50.26it/s, est. speed input: 4535.04 toks/s, output: 1993.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:43<00:13, 62.68it/s, est. speed input: 4680.42 toks/s, output: 2073.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:43<00:13, 60.78it/s, est. speed input: 4815.23 toks/s, output: 2155.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:43<00:10, 74.89it/s, est. speed input: 5001.07 toks/s, output: 2282.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:43<00:10, 75.84it/s, est. speed input: 5094.04 toks/s, output: 2330.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:44<00:13, 58.95it/s, est. speed input: 5165.87 toks/s, output: 2383.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:44<00:13, 55.76it/s, est. speed input: 5246.00 toks/s, output: 2424.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:44<00:12, 60.63it/s, est. speed input: 5335.49 toks/s, output: 2477.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:44<00:14, 50.63it/s, est. speed input: 5403.78 toks/s, output: 2510.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:44<00:13, 56.94it/s, est. speed input: 5496.44 toks/s, output: 2554.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:17, 41.99it/s, est. speed input: 5556.79 toks/s, output: 2584.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:45<00:15, 46.81it/s, est. speed input: 5638.81 toks/s, output: 2628.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:45<00:10, 67.45it/s, est. speed input: 5817.43 toks/s, output: 2738.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:45<00:07, 90.73it/s, est. speed input: 6007.38 toks/s, output: 2852.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:46<00:10, 66.40it/s, est. speed input: 6108.65 toks/s, output: 2931.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:46<00:12, 54.85it/s, est. speed input: 6168.38 toks/s, output: 2976.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:46<00:08, 76.90it/s, est. speed input: 6393.70 toks/s, output: 3136.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:12, 49.98it/s, est. speed input: 6459.52 toks/s, output: 3197.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:47<00:09, 61.65it/s, est. speed input: 6637.54 toks/s, output: 3309.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:47<00:09, 64.38it/s, est. speed input: 6754.77 toks/s, output: 3387.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:47<00:08, 66.93it/s, est. speed input: 6872.60 toks/s, output: 3447.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:47<00:07, 76.18it/s, est. speed input: 6996.54 toks/s, output: 3525.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:47<00:07, 77.09it/s, est. speed input: 7078.88 toks/s, output: 3580.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:48<00:08, 60.07it/s, est. speed input: 7131.11 toks/s, output: 3627.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:48<00:09, 57.58it/s, est. speed input: 7230.44 toks/s, output: 3712.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:05, 84.71it/s, est. speed input: 7496.88 toks/s, output: 3919.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:05, 91.31it/s, est. speed input: 7612.64 toks/s, output: 3990.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:49<00:05, 79.65it/s, est. speed input: 7716.41 toks/s, output: 4083.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:05, 82.93it/s, est. speed input: 7792.64 toks/s, output: 4157.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:49<00:04, 92.54it/s, est. speed input: 7912.40 toks/s, output: 4257.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:03, 106.28it/s, est. speed input: 8085.72 toks/s, output: 4413.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:49<00:02, 142.44it/s, est. speed input: 8337.06 toks/s, output: 4629.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:02, 138.02it/s, est. speed input: 8493.71 toks/s, output: 4776.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:49<00:03, 102.75it/s, est. speed input: 8589.37 toks/s, output: 4866.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:50<00:03, 103.12it/s, est. speed input: 8703.82 toks/s, output: 4976.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:50<00:02, 121.87it/s, est. speed input: 8909.56 toks/s, output: 5144.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:50<00:02, 98.62it/s, est. speed input: 9006.66 toks/s, output: 5233.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:50<00:02, 105.68it/s, est. speed input: 9129.57 toks/s, output: 5321.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:50<00:02, 116.05it/s, est. speed input: 9283.23 toks/s, output: 5465.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:02, 108.70it/s, est. speed input: 9391.39 toks/s, output: 5561.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:51<00:02, 90.57it/s, est. speed input: 9481.61 toks/s, output: 5652.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:51<00:01, 120.08it/s, est. speed input: 9691.97 toks/s, output: 5836.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:51<00:02, 90.82it/s, est. speed input: 9776.98 toks/s, output: 5932.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:51<00:02, 85.83it/s, est. speed input: 9871.80 toks/s, output: 6025.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:51<00:01, 105.35it/s, est. speed input: 10034.99 toks/s, output: 6205.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:51<00:01, 99.68it/s, est. speed input: 10132.35 toks/s, output: 6305.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:52<00:01, 87.39it/s, est. speed input: 10225.91 toks/s, output: 6408.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:52<00:01, 61.97it/s, est. speed input: 10274.59 toks/s, output: 6483.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:52<00:01, 63.52it/s, est. speed input: 10342.51 toks/s, output: 6564.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:53<00:01, 51.93it/s, est. speed input: 10367.19 toks/s, output: 6604.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:01, 58.09it/s, est. speed input: 10429.62 toks/s, output: 6664.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:53<00:01, 59.75it/s, est. speed input: 10482.96 toks/s, output: 6746.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:53<00:01, 36.82it/s, est. speed input: 10469.89 toks/s, output: 6782.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:54<00:01, 27.62it/s, est. speed input: 10447.98 toks/s, output: 6864.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:54<00:01, 29.14it/s, est. speed input: 10463.31 toks/s, output: 6888.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:55<00:00, 31.27it/s, est. speed input: 10500.15 toks/s, output: 6967.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 37.79it/s, est. speed input: 10558.27 toks/s, output: 7055.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:57<00:00, 11.30it/s, est. speed input: 10212.34 toks/s, output: 6893.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:57<00:00, 11.19it/s, est. speed input: 10172.82 toks/s, output: 6879.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 8.94it/s, est. speed input: 10036.34 toks/s, output: 6815.27 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 21.70it/s, est. speed input: 10036.34 toks/s, output: 6815.27 toks/s]
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, we know that angle ACD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. \n\nSince EA is a tangent to circle O at point A, angle EAD is also a right angle (90 degrees) because a tangent to a circle is perpendicular to the radius at the point of tangency.\n\nGiven that angle EAC = 120 degrees, we can find angle CAD by subtracting the right angles from angle EAC:\n\\[ \\angle CAD = 120^\\circ - 90^\\circ = 30^\\circ \\]\n\nSince angle ACD is 90 degrees, triangle ACD is a right triangle with angle CAD at 30 degrees. In a right triangle, the sum of the angles is 90 degrees. Therefore, angle ACB must be:\n\\[ \\angle ACB = 90^\\circ - \\angle CAD = 90^\\circ - 30^\\circ = 60^\\circ \\]\n\nSince angle ACB is an angle at the circumference opposite to angle ABC in the same segment, angle ABC is equal to angle ACB. Therefore, angle ABC is 60 degrees.\n\nThus, the degree of angle ABC is 60 degrees.\n\n\n60\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_40
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 55
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.228
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.013
[36m(Runner pid=3309020)[0m ppo_kl: 2.453941379343405e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.667
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.667
[36m(Runner pid=3309020)[0m min: 0.1
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 999873
[36m(Runner pid=3309020)[0m balanced_min: 999873
[36m(Runner pid=3309020)[0m max: 1002386
[36m(Runner pid=3309020)[0m mean: 999873.0
[36m(Runner pid=3309020)[0m min: 997360
[36m(Runner pid=3309020)[0m minmax_diff: 5026
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.761
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 944.856
[36m(Runner pid=3309020)[0m time_per_step: 1058.228
[36m(Runner pid=3309020)[0m total_num_tokens: 1999746
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 631.0
[36m(Runner pid=3309020)[0m mean: 463.484
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1949.0
[36m(Runner pid=3309020)[0m mean: 317.666
[36m(Runner pid=3309020)[0m min: 58.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.335
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.667
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.510017072844895e-05
[36m(Runner pid=3309020)[0m gen: 0.135
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.281
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.15
[36m(Runner pid=3309020)[0m gen: 110.046
[36m(Runner pid=3309020)[0m old: 87.872
[36m(Runner pid=3309020)[0m ref: 88.524
[36m(Runner pid=3309020)[0m reward: 6.741
[36m(Runner pid=3309020)[0m save_checkpoint: 32.252
[36m(Runner pid=3309020)[0m step: 1058.228
[36m(Runner pid=3309020)[0m update_actor: 562.775
[36m(Runner pid=3309020)[0m validation: 169.245
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.401
[36m(Runner pid=3309020)[0m format_reward: 0.969
[36m(Runner pid=3309020)[0m overall_reward: 0.687
[36m(Runner pid=3309020)[0m reward_score: 0.687
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.977
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 56; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:15:44 [executor_base.py:219] It took 0.338578 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:17:15 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:15:44 [executor_base.py:219] It took 0.340496 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:17:15 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:17:15 [executor_base.py:208] It took 0.327506 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:17:16 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:17:17 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:17:17 [executor_base.py:208] It took 0.325298 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.47053301334381104, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0004813412087969482, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006656438927166164}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.000264315924141556, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005288677057251334}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.34626132249832153, 'actor/pg_clipfrac': 0.0010389609960839152, 'actor/ppo_kl': -0.0001318022550549358}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.21233877539634705, 'actor/pg_clipfrac': 0.0027027027681469917, 'actor/ppo_kl': 0.0005478420644067228}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.18638020753860474, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.26711589097976685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0009509023511782289, 'actor/pg_clipfrac': 0.0028653296176344156, 'actor/ppo_kl': 0.0010480514029040933}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.07529324293136597, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.35673537850379944, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.26062482595443726, 'actor/pg_clipfrac': 0.001607717014849186, 'actor/ppo_kl': -0.0014090215554460883}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.1446714848279953, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003681050438899547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017945318249985576}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.10177826136350632, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002371962880715728}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.34855297207832336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.14384876191616058, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00022078760957811028}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.14027968049049377, 'actor/pg_clipfrac': 0.0011771629797294736, 'actor/ppo_kl': -0.0004195448418613523}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.4616806209087372, 'actor/pg_clipfrac': 0.0027573530096560717, 'actor/ppo_kl': 0.001398076070472598}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.33083778619766235, 'actor/pg_clipfrac': 0.007163323927670717, 'actor/ppo_kl': -0.0029792676214128733}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002173278626287356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013915464514866471}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4568103551864624, 'actor/pg_clipfrac': 0.0006191950524225831, 'actor/ppo_kl': -0.00031631754245609045}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.24758705496788025, 'actor/pg_clipfrac': 0.0015082956524565816, 'actor/ppo_kl': 0.0010642439592629671}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.019315944984555244, 'actor/pg_clipfrac': 0.0005086470046080649, 'actor/ppo_kl': 0.0006384175503626466}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00025990730500780046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.874329847050831e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2948591709136963, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002774263557512313}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.302854061126709, 'actor/pg_clipfrac': 0.0019102196674793959, 'actor/ppo_kl': 0.0025648155715316534}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00044253270607441664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.5094226000655908e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.44804418087005615, 'actor/pg_clipfrac': 0.003980891779065132, 'actor/ppo_kl': -0.0017806602409109473}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003975004656240344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000722961500287056}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00037057619192637503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00039973066304810345}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.3893283009529114, 'actor/pg_clipfrac': 0.0015847861068323255, 'actor/ppo_kl': 0.0009069624356925488}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.03505106642842293, 'actor/pg_clipfrac': 0.0007067137630656362, 'actor/ppo_kl': -0.0006161059718579054}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002713943540584296, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013724229065701365}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.3397200405597687, 'actor/pg_clipfrac': 0.0009107468067668378, 'actor/ppo_kl': 0.00035887336707673967}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.09132932126522064, 'actor/pg_clipfrac': 0.0013306719483807683, 'actor/ppo_kl': 0.001124574220739305}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.4168216586112976, 'actor/pg_clipfrac': 0.0007880220655351877, 'actor/ppo_kl': -0.0004421486519277096}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.44797056913375854, 'actor/pg_clipfrac': 0.001721170381642878, 'actor/ppo_kl': -0.00032699209987185895}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0704425498843193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.2258892537793145e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.053687017410993576, 'actor/pg_clipfrac': 0.0009157509193755686, 'actor/ppo_kl': 0.00019540367065928876}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.008723143488168716, 'actor/pg_clipfrac': 0.0026420080102980137, 'actor/ppo_kl': -0.0006265703123062849}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.24380657076835632, 'actor/pg_clipfrac': 0.003071252955123782, 'actor/ppo_kl': -0.0004520539368968457}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.44847893714904785, 'actor/pg_clipfrac': 0.0021505376789718866, 'actor/ppo_kl': -0.0005680330214090645}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.18763159215450287, 'actor/pg_clipfrac': 0.004149377811700106, 'actor/ppo_kl': 0.0006056128768250346}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2204188108444214, 'actor/pg_clipfrac': 0.0007209805189631879, 'actor/ppo_kl': 0.0003880320000462234}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.13227082788944244, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013966378755867481}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2567865252494812, 'actor/pg_clipfrac': 0.002072538947686553, 'actor/ppo_kl': 0.0008073204080574214}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00029642917797900736, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009180867928080261}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.2655700743198395, 'actor/pg_clipfrac': 0.0017590149072930217, 'actor/ppo_kl': -0.0014075880171731114}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00038355248398147523, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009533935808576643}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4475153386592865, 'actor/pg_clipfrac': 0.002685765502974391, 'actor/ppo_kl': -0.0008078022510744631}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.9486411213874817, 'actor/pg_clipfrac': 0.0016051364364102483, 'actor/ppo_kl': 0.0007967737037688494}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.02269609272480011, 'actor/pg_clipfrac': 0.002772643230855465, 'actor/ppo_kl': -0.000698586693033576}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.24427418410778046, 'actor/pg_clipfrac': 0.003105590119957924, 'actor/ppo_kl': -0.0009048207430168986}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.3878938555717468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002832020109053701}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.605254590511322, 'actor/pg_clipfrac': 0.0008149959030561149, 'actor/ppo_kl': -0.0005202103056944907}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5950736403465271, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.787607501086313e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 1.2123870849609375, 'actor/pg_clipfrac': 0.0007861634949222207, 'actor/ppo_kl': -0.0005131592624820769}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021211472630966455, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001151621458120644}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3122149407863617, 'actor/pg_clipfrac': 0.0005854800692759454, 'actor/ppo_kl': 0.0016018947353586555}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00048967229668051, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0029842343647032976}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.10023120045661926, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 1.8530683973949635e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.46275171637535095, 'actor/pg_clipfrac': 0.003372681327164173, 'actor/ppo_kl': 0.001011100597679615}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00043740274850279093, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005638384609483182}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1821122020483017, 'actor/pg_clipfrac': 0.0017937219236046076, 'actor/ppo_kl': -0.00015509566583205014}
[36m(Runner pid=3309020)[0m Step 56
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.266
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.049
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 1.9515883696286095e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.681
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.681
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 990365
[36m(Runner pid=3309020)[0m balanced_min: 990364
[36m(Runner pid=3309020)[0m max: 993869
[36m(Runner pid=3309020)[0m mean: 990364.5
[36m(Runner pid=3309020)[0m min: 986860
[36m(Runner pid=3309020)[0m minmax_diff: 7009
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.812
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1160.204
[36m(Runner pid=3309020)[0m time_per_step: 853.612
[36m(Runner pid=3309020)[0m total_num_tokens: 1980729
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 464.01
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1583.0
[36m(Runner pid=3309020)[0m mean: 309.712
[36m(Runner pid=3309020)[0m min: 55.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.362
[36m(Runner pid=3309020)[0m format: 0.999
[36m(Runner pid=3309020)[0m overall: 0.681
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.000292269130366e-05
[36m(Runner pid=3309020)[0m gen: 0.136
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.284
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.158
[36m(Runner pid=3309020)[0m gen: 107.689
[36m(Runner pid=3309020)[0m old: 87.979
[36m(Runner pid=3309020)[0m ref: 88.303
[36m(Runner pid=3309020)[0m reward: 6.312
[36m(Runner pid=3309020)[0m step: 853.612
[36m(Runner pid=3309020)[0m update_actor: 562.453
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 57; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:30:01 [executor_base.py:219] It took 0.338978 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:26:15, 4.06s/it, est. speed input: 113.82 toks/s, output: 27.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<50:03, 2.37s/it, est. speed input: 170.48 toks/s, output: 40.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<28:18, 1.34s/it, est. speed input: 252.82 toks/s, output: 63.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:27<18:57, 1.11it/s, est. speed input: 324.51 toks/s, output: 84.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:31, 1.27it/s, est. speed input: 367.98 toks/s, output: 101.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<11:48, 1.77it/s, est. speed input: 432.27 toks/s, output: 124.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<09:20, 2.22it/s, est. speed input: 487.97 toks/s, output: 142.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<07:40, 2.70it/s, est. speed input: 537.07 toks/s, output: 156.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<05:29, 3.74it/s, est. speed input: 644.31 toks/s, output: 194.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:25, 5.92it/s, est. speed input: 760.04 toks/s, output: 235.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:47, 7.25it/s, est. speed input: 822.36 toks/s, output: 259.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:20, 8.55it/s, est. speed input: 931.43 toks/s, output: 296.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<01:57, 10.20it/s, est. speed input: 987.48 toks/s, output: 314.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:55, 10.38it/s, est. speed input: 1039.54 toks/s, output: 333.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:18, 15.02it/s, est. speed input: 1158.85 toks/s, output: 382.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:16, 15.35it/s, est. speed input: 1207.07 toks/s, output: 401.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:59, 19.62it/s, est. speed input: 1314.05 toks/s, output: 447.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:49, 23.31it/s, est. speed input: 1417.33 toks/s, output: 501.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:43, 26.41it/s, est. speed input: 1523.13 toks/s, output: 549.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:44, 25.93it/s, est. speed input: 1576.35 toks/s, output: 573.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:44, 25.63it/s, est. speed input: 1630.96 toks/s, output: 597.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:39<00:53, 21.22it/s, est. speed input: 1721.26 toks/s, output: 636.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:39<00:54, 20.68it/s, est. speed input: 1772.56 toks/s, output: 660.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:44, 25.05it/s, est. speed input: 1874.99 toks/s, output: 706.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:40<00:40, 27.37it/s, est. speed input: 1926.02 toks/s, output: 731.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:32, 33.92it/s, est. speed input: 2025.31 toks/s, output: 777.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:40<00:21, 49.25it/s, est. speed input: 2245.47 toks/s, output: 882.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:19, 55.50it/s, est. speed input: 2354.33 toks/s, output: 930.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:18, 55.84it/s, est. speed input: 2458.34 toks/s, output: 978.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:20, 51.55it/s, est. speed input: 2561.18 toks/s, output: 1030.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:41<00:23, 45.18it/s, est. speed input: 2651.66 toks/s, output: 1068.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:25, 41.27it/s, est. speed input: 2697.71 toks/s, output: 1090.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:35, 28.63it/s, est. speed input: 2728.15 toks/s, output: 1100.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:42<00:24, 41.91it/s, est. speed input: 2919.58 toks/s, output: 1187.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:27, 36.92it/s, est. speed input: 2961.98 toks/s, output: 1211.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:25, 39.64it/s, est. speed input: 3059.45 toks/s, output: 1257.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:42<00:25, 38.08it/s, est. speed input: 3102.46 toks/s, output: 1283.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:42<00:22, 44.25it/s, est. speed input: 3199.19 toks/s, output: 1324.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:42<00:21, 44.97it/s, est. speed input: 3247.07 toks/s, output: 1346.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:43<00:17, 55.83it/s, est. speed input: 3346.62 toks/s, output: 1383.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:43<00:13, 69.02it/s, est. speed input: 3489.47 toks/s, output: 1446.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:13, 68.10it/s, est. speed input: 3587.46 toks/s, output: 1498.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:43<00:11, 77.91it/s, est. speed input: 3735.11 toks/s, output: 1588.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:43<00:12, 74.87it/s, est. speed input: 3828.36 toks/s, output: 1637.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:43<00:14, 60.90it/s, est. speed input: 3913.53 toks/s, output: 1681.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:44<00:14, 62.90it/s, est. speed input: 4005.42 toks/s, output: 1738.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:44<00:11, 74.10it/s, est. speed input: 4149.87 toks/s, output: 1800.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:44<00:13, 66.23it/s, est. speed input: 4277.56 toks/s, output: 1867.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:44<00:12, 68.44it/s, est. speed input: 4375.46 toks/s, output: 1917.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:10, 76.76it/s, est. speed input: 4556.45 toks/s, output: 2026.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:10, 76.77it/s, est. speed input: 4643.67 toks/s, output: 2067.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:45<00:13, 61.98it/s, est. speed input: 4765.44 toks/s, output: 2133.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:10, 73.14it/s, est. speed input: 4902.73 toks/s, output: 2211.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:10, 74.70it/s, est. speed input: 4987.13 toks/s, output: 2259.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:10, 76.41it/s, est. speed input: 5079.69 toks/s, output: 2304.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:09, 77.93it/s, est. speed input: 5177.61 toks/s, output: 2360.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:11, 67.59it/s, est. speed input: 5259.02 toks/s, output: 2415.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:46<00:09, 75.12it/s, est. speed input: 5390.85 toks/s, output: 2507.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:09, 77.23it/s, est. speed input: 5480.27 toks/s, output: 2569.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:46<00:11, 65.34it/s, est. speed input: 5553.48 toks/s, output: 2612.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:46<00:10, 64.09it/s, est. speed input: 5669.52 toks/s, output: 2684.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:46<00:09, 70.45it/s, est. speed input: 5761.73 toks/s, output: 2740.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:09, 71.71it/s, est. speed input: 5840.09 toks/s, output: 2795.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:47<00:08, 82.12it/s, est. speed input: 6011.31 toks/s, output: 2904.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:47<00:10, 65.23it/s, est. speed input: 6075.62 toks/s, output: 2947.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:47<00:08, 77.18it/s, est. speed input: 6205.80 toks/s, output: 3031.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:05, 109.71it/s, est. speed input: 6484.14 toks/s, output: 3206.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:04, 122.92it/s, est. speed input: 6648.55 toks/s, output: 3314.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:47<00:05, 110.26it/s, est. speed input: 6772.89 toks/s, output: 3402.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:48<00:06, 93.29it/s, est. speed input: 6884.86 toks/s, output: 3487.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:48<00:05, 91.76it/s, est. speed input: 7002.77 toks/s, output: 3552.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:48<00:06, 85.30it/s, est. speed input: 7080.72 toks/s, output: 3620.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:48<00:05, 90.05it/s, est. speed input: 7202.27 toks/s, output: 3717.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:48<00:04, 113.48it/s, est. speed input: 7411.17 toks/s, output: 3877.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:49<00:05, 94.30it/s, est. speed input: 7518.52 toks/s, output: 3949.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:49<00:05, 91.00it/s, est. speed input: 7652.69 toks/s, output: 4032.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:49<00:04, 98.45it/s, est. speed input: 7814.84 toks/s, output: 4149.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:03, 130.22it/s, est. speed input: 8073.14 toks/s, output: 4329.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:49<00:02, 132.14it/s, est. speed input: 8259.45 toks/s, output: 4476.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 115.92it/s, est. speed input: 8369.75 toks/s, output: 4554.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:50<00:03, 109.80it/s, est. speed input: 8483.33 toks/s, output: 4615.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:50<00:03, 107.15it/s, est. speed input: 8631.30 toks/s, output: 4752.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:50<00:03, 86.76it/s, est. speed input: 8726.08 toks/s, output: 4842.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:50<00:04, 75.15it/s, est. speed input: 8776.00 toks/s, output: 4893.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:03, 83.76it/s, est. speed input: 8895.12 toks/s, output: 4999.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:50<00:03, 86.34it/s, est. speed input: 8968.48 toks/s, output: 5057.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:51<00:03, 73.88it/s, est. speed input: 9053.91 toks/s, output: 5151.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:51<00:03, 82.54it/s, est. speed input: 9164.96 toks/s, output: 5257.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:51<00:04, 62.22it/s, est. speed input: 9206.90 toks/s, output: 5310.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:51<00:03, 63.20it/s, est. speed input: 9304.48 toks/s, output: 5388.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:51<00:03, 74.90it/s, est. speed input: 9412.26 toks/s, output: 5511.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:52<00:03, 68.57it/s, est. speed input: 9467.33 toks/s, output: 5559.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:52<00:02, 83.01it/s, est. speed input: 9587.82 toks/s, output: 5673.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:52<00:02, 78.32it/s, est. speed input: 9649.97 toks/s, output: 5752.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:52<00:02, 77.40it/s, est. speed input: 9714.88 toks/s, output: 5818.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:52<00:02, 77.18it/s, est. speed input: 9772.80 toks/s, output: 5890.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:52<00:01, 89.28it/s, est. speed input: 9912.30 toks/s, output: 6036.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:53<00:01, 71.24it/s, est. speed input: 9956.03 toks/s, output: 6108.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:53<00:02, 59.67it/s, est. speed input: 9996.68 toks/s, output: 6153.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:53<00:01, 83.87it/s, est. speed input: 10140.31 toks/s, output: 6306.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:53<00:01, 75.08it/s, est. speed input: 10228.62 toks/s, output: 6416.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:00, 86.01it/s, est. speed input: 10333.95 toks/s, output: 6517.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:54<00:00, 69.55it/s, est. speed input: 10411.53 toks/s, output: 6652.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:54<00:01, 53.36it/s, est. speed input: 10437.59 toks/s, output: 6716.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:54<00:01, 37.22it/s, est. speed input: 10426.52 toks/s, output: 6754.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:01, 29.78it/s, est. speed input: 10409.66 toks/s, output: 6783.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:55<00:01, 24.79it/s, est. speed input: 10376.74 toks/s, output: 6780.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:56<00:00, 27.18it/s, est. speed input: 10409.92 toks/s, output: 6864.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:00, 29.59it/s, est. speed input: 10429.17 toks/s, output: 6922.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 37.43it/s, est. speed input: 10484.44 toks/s, output: 7028.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.35it/s, est. speed input: 10362.50 toks/s, output: 6999.34 toks/s]
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:31:29 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:30:01 [executor_base.py:219] It took 0.340021 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:31:29 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:31:29 [executor_base.py:208] It took 0.325294 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:31:30 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:31:30 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:31:30 [executor_base.py:208] It took 0.327999 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.24403028190135956, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002978073316626251, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004333076358307153}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00020288133237045258, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00030290408176369965, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.19847196340560913, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0004418391617946327, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006607475806958973}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.11649024486541748, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5258920788764954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.4417005181312561, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.047825343906879425, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010957363992929459}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.13497225940227509, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.6365951895713806, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.16009512543678284, 'actor/pg_clipfrac': 0.001215066877193749, 'actor/ppo_kl': -0.0002595987170934677}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0005075493827462196, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008850141311995685}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0002648297813721001, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007871384150348604}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002977683034259826, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.7484400868415833, 'actor/pg_clipfrac': 0.001088139251805842, 'actor/ppo_kl': 0.00019614974735304713}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0032013515010476112, 'actor/pg_clipfrac': 0.0020408162381500006, 'actor/ppo_kl': 0.0013958561467006803}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.03780541941523552, 'actor/pg_clipfrac': 0.001377410488203168, 'actor/ppo_kl': 2.3673717805650085e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.16216863691806793, 'actor/pg_clipfrac': 0.0024549919180572033, 'actor/ppo_kl': 0.001570685999467969}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.3201472759246826, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012871025828644633}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.30661100149154663, 'actor/pg_clipfrac': 0.0009620009805075824, 'actor/ppo_kl': 0.0011586358305066824}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.07044150680303574, 'actor/pg_clipfrac': 0.000561797758564353, 'actor/ppo_kl': -0.0005790978320874274}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3799441158771515, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008509253384545445}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.33754831552505493, 'actor/pg_clipfrac': 0.0010167767759412527, 'actor/ppo_kl': 0.00010684934386517853}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4477347433567047, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003404869348742068}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.15727335214614868, 'actor/pg_clipfrac': 0.004102563951164484, 'actor/ppo_kl': 0.0005137732368893921}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.09281453490257263, 'actor/pg_clipfrac': 0.0012650220887735486, 'actor/ppo_kl': -0.0015612558927387}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002220940514234826, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00013836690050084144}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.4475414454936981, 'actor/pg_clipfrac': 0.001437814556993544, 'actor/ppo_kl': 0.0003637509944383055}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.28243884444236755, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007066159159876406}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00028137650224380195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010393393458798528}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.12022534757852554, 'actor/pg_clipfrac': 0.0006426735199056566, 'actor/ppo_kl': 0.0004396843141876161}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.14521992206573486, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008454509079456329}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.9082810282707214, 'actor/pg_clipfrac': 0.0013413815759122372, 'actor/ppo_kl': -0.0001712929515633732}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.11038956791162491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009545410284772515}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4104185998439789, 'actor/pg_clipfrac': 0.001306335674598813, 'actor/ppo_kl': -0.0012532962718978524}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.18480327725410461, 'actor/pg_clipfrac': 0.0030832476913928986, 'actor/ppo_kl': 0.0011662564938887954}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.8521372675895691, 'actor/pg_clipfrac': 0.0024449878837913275, 'actor/ppo_kl': 0.0014757997123524547}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.39325132966041565, 'actor/pg_clipfrac': 0.0038491147570312023, 'actor/ppo_kl': -0.0006395872915163636}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.050893690437078476, 'actor/pg_clipfrac': 0.0013908206019550562, 'actor/ppo_kl': 0.0006660387152805924}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.5389861464500427, 'actor/pg_clipfrac': 0.0014154281234368682, 'actor/ppo_kl': 0.00038708109059371054}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.4337827265262604, 'actor/pg_clipfrac': 0.0006269592558965087, 'actor/ppo_kl': 0.00034343276638537645}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2694258689880371, 'actor/pg_clipfrac': 0.0009765625, 'actor/ppo_kl': 0.0014742184430360794}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.31783151626586914, 'actor/pg_clipfrac': 0.0008071024785749614, 'actor/ppo_kl': -0.00011487657320685685}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.339846670627594, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009081136668100953}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2687716782093048, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.1406508519939962e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.14248725771903992, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008958150283433497}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3549076020717621, 'actor/pg_clipfrac': 0.0013363028410822153, 'actor/ppo_kl': 6.947082147235051e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.15106528997421265, 'actor/pg_clipfrac': 0.0009433962404727936, 'actor/ppo_kl': 0.0017389855347573757}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.012847512029111385, 'actor/pg_clipfrac': 0.0015710919396951795, 'actor/ppo_kl': 0.0007546478300355375}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.408341646194458, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 9.119730384554714e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.11790388822555542, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009848953923210502}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.2781422734260559, 'actor/pg_clipfrac': 0.0019474197179079056, 'actor/ppo_kl': -0.00044499911018647254}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.20375464856624603, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006925432826392353}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00015158887254074216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00015036472177598625}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0005320830387063324, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001681922294665128}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.000280300184385851, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006223960663191974}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.12967723608016968, 'actor/pg_clipfrac': 0.0008635578560642898, 'actor/ppo_kl': -0.00013376641436479986}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.673321008682251, 'actor/pg_clipfrac': 0.007559395395219326, 'actor/ppo_kl': 0.0003973546845372766}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.28018686175346375, 'actor/pg_clipfrac': 0.003597122384235263, 'actor/ppo_kl': -0.0012686498230323195}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23322205245494843, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002873428456950933}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.000259743130300194, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.20969697390683e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11971985548734665, 'actor/pg_clipfrac': 0.004464285913854837, 'actor/ppo_kl': -0.002847948344424367}
[36m(Runner pid=3309020)[0m Step 57
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.251
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.034
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.015
[36m(Runner pid=3309020)[0m ppo_kl: -4.540424574095425e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.019
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.67
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.67
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 998551
[36m(Runner pid=3309020)[0m balanced_min: 998550
[36m(Runner pid=3309020)[0m max: 1002638
[36m(Runner pid=3309020)[0m mean: 998550.5
[36m(Runner pid=3309020)[0m min: 994463
[36m(Runner pid=3309020)[0m minmax_diff: 8175
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.295
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1161.815
[36m(Runner pid=3309020)[0m time_per_step: 859.474
[36m(Runner pid=3309020)[0m total_num_tokens: 1997101
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 814.0
[36m(Runner pid=3309020)[0m mean: 465.117
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1252.0
[36m(Runner pid=3309020)[0m mean: 315.0
[36m(Runner pid=3309020)[0m min: 50.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.341
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.67
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.81172590171536e-05
[36m(Runner pid=3309020)[0m gen: 0.133
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.01
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:20<1:25:44, 4.04s/it, est. speed input: 130.11 toks/s, output: 24.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:20<36:30, 1.72s/it, est. speed input: 238.26 toks/s, output: 49.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:21<20:37, 1.02it/s, est. speed input: 341.92 toks/s, output: 72.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<25:08, 1.20s/it, est. speed input: 328.73 toks/s, output: 81.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<17:30, 1.19it/s, est. speed input: 391.58 toks/s, output: 105.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<11:50, 1.76it/s, est. speed input: 462.67 toks/s, output: 129.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:30<08:57, 2.32it/s, est. speed input: 518.43 toks/s, output: 150.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<07:22, 2.80it/s, est. speed input: 576.16 toks/s, output: 167.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<05:49, 3.53it/s, est. speed input: 637.42 toks/s, output: 189.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<04:29, 4.56it/s, est. speed input: 701.17 toks/s, output: 211.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:32<03:22, 6.04it/s, est. speed input: 766.81 toks/s, output: 234.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<04:55, 4.12it/s, est. speed input: 784.24 toks/s, output: 244.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<03:57, 5.12it/s, est. speed input: 841.56 toks/s, output: 265.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:56, 6.85it/s, est. speed input: 900.29 toks/s, output: 284.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:59, 6.72it/s, est. speed input: 941.13 toks/s, output: 303.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:38<03:59, 5.00it/s, est. speed input: 962.85 toks/s, output: 317.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:38<02:57, 6.72it/s, est. speed input: 1020.73 toks/s, output: 339.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:38<01:42, 11.52it/s, est. speed input: 1146.31 toks/s, output: 381.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:56, 20.66it/s, est. speed input: 1328.05 toks/s, output: 453.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:51, 22.59it/s, est. speed input: 1441.01 toks/s, output: 498.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<00:50, 22.79it/s, est. speed input: 1494.16 toks/s, output: 521.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:39<00:53, 21.56it/s, est. speed input: 1542.56 toks/s, output: 540.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:39<00:38, 29.24it/s, est. speed input: 1653.76 toks/s, output: 593.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:39<00:33, 33.88it/s, est. speed input: 1759.00 toks/s, output: 637.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:25, 43.96it/s, est. speed input: 1925.32 toks/s, output: 710.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:40<00:32, 33.52it/s, est. speed input: 2017.76 toks/s, output: 752.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:40<00:35, 30.50it/s, est. speed input: 2110.86 toks/s, output: 791.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:41<00:46, 23.11it/s, est. speed input: 2188.36 toks/s, output: 836.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:45, 23.82it/s, est. speed input: 2228.92 toks/s, output: 858.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:42<00:53, 19.99it/s, est. speed input: 2261.07 toks/s, output: 876.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:42<00:34, 30.88it/s, est. speed input: 2421.54 toks/s, output: 963.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:42<00:32, 32.53it/s, est. speed input: 2471.84 toks/s, output: 986.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:29, 35.60it/s, est. speed input: 2564.73 toks/s, output: 1019.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:42<00:32, 31.71it/s, est. speed input: 2606.53 toks/s, output: 1043.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:25, 41.10it/s, est. speed input: 2701.36 toks/s, output: 1099.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:43<00:24, 42.24it/s, est. speed input: 2822.99 toks/s, output: 1156.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:43<00:25, 39.41it/s, est. speed input: 2867.18 toks/s, output: 1178.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:16, 61.75it/s, est. speed input: 3066.63 toks/s, output: 1276.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:43<00:17, 57.34it/s, est. speed input: 3159.85 toks/s, output: 1325.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:43<00:20, 47.06it/s, est. speed input: 3241.07 toks/s, output: 1364.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:44<00:20, 47.63it/s, est. speed input: 3328.47 toks/s, output: 1421.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:44<00:18, 51.92it/s, est. speed input: 3417.15 toks/s, output: 1477.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:44<00:17, 54.98it/s, est. speed input: 3551.50 toks/s, output: 1553.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:44<00:15, 61.14it/s, est. speed input: 3695.88 toks/s, output: 1637.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:44<00:12, 70.98it/s, est. speed input: 3840.81 toks/s, output: 1716.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:45<00:13, 65.91it/s, est. speed input: 3928.92 toks/s, output: 1768.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:45<00:12, 71.53it/s, est. speed input: 4070.69 toks/s, output: 1835.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:45<00:12, 67.58it/s, est. speed input: 4156.98 toks/s, output: 1885.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:45<00:15, 57.13it/s, est. speed input: 4243.58 toks/s, output: 1930.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:45<00:15, 54.13it/s, est. speed input: 4324.63 toks/s, output: 1987.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:12, 68.02it/s, est. speed input: 4468.24 toks/s, output: 2076.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:46<00:12, 66.44it/s, est. speed input: 4551.23 toks/s, output: 2132.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:46<00:14, 57.31it/s, est. speed input: 4622.23 toks/s, output: 2178.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:46<00:11, 70.48it/s, est. speed input: 4801.00 toks/s, output: 2284.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:46<00:11, 68.86it/s, est. speed input: 4884.73 toks/s, output: 2334.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:46<00:10, 72.87it/s, est. speed input: 4975.68 toks/s, output: 2386.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:47<00:11, 66.63it/s, est. speed input: 5061.00 toks/s, output: 2429.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:47<00:09, 82.70it/s, est. speed input: 5196.15 toks/s, output: 2507.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:47<00:10, 70.19it/s, est. speed input: 5268.34 toks/s, output: 2544.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:47<00:07, 100.05it/s, est. speed input: 5497.20 toks/s, output: 2678.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:47<00:07, 95.31it/s, est. speed input: 5666.45 toks/s, output: 2794.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:47<00:07, 86.13it/s, est. speed input: 5783.05 toks/s, output: 2868.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:48<00:09, 73.03it/s, est. speed input: 5857.62 toks/s, output: 2917.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:48<00:06, 95.82it/s, est. speed input: 6074.37 toks/s, output: 3068.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:48<00:05, 106.17it/s, est. speed input: 6245.37 toks/s, output: 3175.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:48<00:05, 114.87it/s, est. speed input: 6431.88 toks/s, output: 3309.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:48<00:05, 113.07it/s, est. speed input: 6550.57 toks/s, output: 3409.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:48<00:04, 125.80it/s, est. speed input: 6721.41 toks/s, output: 3532.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:49<00:06, 92.26it/s, est. speed input: 6820.13 toks/s, output: 3595.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:49<00:05, 92.16it/s, est. speed input: 6936.92 toks/s, output: 3681.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:49<00:05, 96.68it/s, est. speed input: 7061.26 toks/s, output: 3767.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:49<00:03, 135.39it/s, est. speed input: 7325.95 toks/s, output: 3944.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:49<00:03, 140.91it/s, est. speed input: 7497.78 toks/s, output: 4054.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:50<00:04, 99.60it/s, est. speed input: 7632.43 toks/s, output: 4131.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:50<00:04, 105.08it/s, est. speed input: 7753.90 toks/s, output: 4198.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:50<00:04, 102.58it/s, est. speed input: 7863.06 toks/s, output: 4286.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:50<00:04, 91.35it/s, est. speed input: 7970.19 toks/s, output: 4376.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:50<00:04, 83.23it/s, est. speed input: 8074.43 toks/s, output: 4460.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:50<00:03, 103.07it/s, est. speed input: 8313.79 toks/s, output: 4650.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:51<00:02, 116.85it/s, est. speed input: 8471.28 toks/s, output: 4788.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:51<00:02, 114.34it/s, est. speed input: 8588.70 toks/s, output: 4867.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:51<00:02, 122.72it/s, est. speed input: 8745.73 toks/s, output: 5013.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:51<00:02, 119.69it/s, est. speed input: 8851.85 toks/s, output: 5118.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:51<00:02, 108.56it/s, est. speed input: 8952.72 toks/s, output: 5195.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:51<00:02, 102.00it/s, est. speed input: 9056.00 toks/s, output: 5290.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:51<00:02, 113.34it/s, est. speed input: 9208.72 toks/s, output: 5430.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:52<00:01, 119.08it/s, est. speed input: 9321.24 toks/s, output: 5549.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:52<00:01, 156.02it/s, est. speed input: 9559.23 toks/s, output: 5742.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:52<00:01, 163.23it/s, est. speed input: 9762.10 toks/s, output: 5941.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:52<00:01, 148.33it/s, est. speed input: 9907.63 toks/s, output: 6066.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:52<00:01, 84.30it/s, est. speed input: 9999.83 toks/s, output: 6156.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:53<00:01, 86.47it/s, est. speed input: 10145.08 toks/s, output: 6287.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:53<00:01, 91.27it/s, est. speed input: 10244.34 toks/s, output: 6396.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:53<00:01, 74.80it/s, est. speed input: 10313.13 toks/s, output: 6472.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:53<00:01, 64.67it/s, est. speed input: 10367.85 toks/s, output: 6546.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:54<00:01, 66.08it/s, est. speed input: 10427.80 toks/s, output: 6620.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:54<00:01, 59.47it/s, est. speed input: 10486.91 toks/s, output: 6683.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:54<00:00, 69.47it/s, est. speed input: 10583.26 toks/s, output: 6809.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:54<00:00, 65.50it/s, est. speed input: 10635.59 toks/s, output: 6885.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:54<00:00, 63.76it/s, est. speed input: 10690.09 toks/s, output: 6954.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:00, 28.40it/s, est. speed input: 10602.07 toks/s, output: 6948.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 27.71it/s, est. speed input: 10613.30 toks/s, output: 6995.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 15.10it/s, est. speed input: 10447.71 toks/s, output: 6909.93 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.41it/s, est. speed input: 10447.71 toks/s, output: 6909.93 toks/s]
[36m(Runner pid=3309020)[0m update_actor: 0.283
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.196
[36m(Runner pid=3309020)[0m gen: 106.917
[36m(Runner pid=3309020)[0m old: 88.729
[36m(Runner pid=3309020)[0m ref: 90.986
[36m(Runner pid=3309020)[0m reward: 7.688
[36m(Runner pid=3309020)[0m step: 859.474
[36m(Runner pid=3309020)[0m update_actor: 564.348
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 58; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:44:23 [executor_base.py:219] It took 0.338889 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.99 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:45:51 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:44:23 [executor_base.py:219] It took 0.340109 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:45:51 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:45:51 [executor_base.py:208] It took 0.325497 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:46:15 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:46:16 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:46:16 [executor_base.py:208] It took 0.327582 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.19137296080589294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.5279964208602905, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.7320663332939148, 'actor/pg_clipfrac': 0.0006468305364251137, 'actor/ppo_kl': -0.0011874614283442497}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00011881399404956028, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -3.5276866583444644e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002256772859254852, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037754609365947545}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.16673599183559418, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0004582582332659513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010555387707427144}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0003921653551515192, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.6626468752510846e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.49652528762817383, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016938053304329515}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002510933845769614, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.000346146960509941, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.1402367502450943, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.19321243464946747, 'actor/pg_clipfrac': 0.0008326394599862397, 'actor/ppo_kl': -0.001344709424301982}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.5136128067970276, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00040717722731642425}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6322301030158997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.12979409098625183, 'actor/pg_clipfrac': 0.0006747638108208776, 'actor/ppo_kl': 8.935709774959832e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.7001090049743652, 'actor/pg_clipfrac': 0.0035158211831003428, 'actor/ppo_kl': -0.001964838244020939}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0005238710436969995, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005203561740927398}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.13688068091869354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002000167005462572}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.06543660163879395, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003864351019728929}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5486962795257568, 'actor/pg_clipfrac': 0.0005089058540761471, 'actor/ppo_kl': -0.0011296900920569897}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4527733623981476, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008521407144144177}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1598135232925415, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007877897587604821}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.16318441927433014, 'actor/pg_clipfrac': 0.0021621622145175934, 'actor/ppo_kl': -0.001473516458645463}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.43171411752700806, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00016006188525352627}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00032214971724897623, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00034806635812856257}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00035761561593972147, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002718726173043251}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1265878975391388, 'actor/pg_clipfrac': 0.0024009603075683117, 'actor/ppo_kl': -0.0013507542898878455}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.19937600195407867, 'actor/pg_clipfrac': 0.0010090817231684923, 'actor/ppo_kl': -0.00027336482889950275}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.19507759809494019, 'actor/pg_clipfrac': 0.0007587253348901868, 'actor/ppo_kl': 0.0004078600322827697}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.21315811574459076, 'actor/pg_clipfrac': 0.0007027406827546656, 'actor/ppo_kl': 0.0001393825514242053}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.33697739243507385, 'actor/pg_clipfrac': 0.00699300691485405, 'actor/ppo_kl': 0.00029324617935344577}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002279237232869491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001331761828623712}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.036069802939891815, 'actor/pg_clipfrac': 0.0017079418757930398, 'actor/ppo_kl': -0.0006776006775908172}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.008037704974412918, 'actor/pg_clipfrac': 0.0006635700119659305, 'actor/ppo_kl': 0.0002450607717037201}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00030335463816300035, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013450811384245753}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0001943889947142452, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000947505293879658}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.07425577193498611, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001296426635235548}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.29033324122428894, 'actor/pg_clipfrac': 0.002044989727437496, 'actor/ppo_kl': 0.0018614240689203143}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0004155373608227819, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009852189105004072}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0002892735064961016, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005055865622125566}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.09872884303331375, 'actor/pg_clipfrac': 0.0025575447361916304, 'actor/ppo_kl': 0.000338908052071929}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.17025651037693024, 'actor/pg_clipfrac': 0.0036523009184747934, 'actor/ppo_kl': -0.0011377173941582441}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002361228980589658, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006487166974693537}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.26285743713378906, 'actor/pg_clipfrac': 0.0011111111380159855, 'actor/ppo_kl': 0.0014018970541656017}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23181767761707306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007380764000117779}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.008475028909742832, 'actor/pg_clipfrac': 0.000700770819094032, 'actor/ppo_kl': -0.0005710136611014605}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4128705859184265, 'actor/pg_clipfrac': 0.0008826125413179398, 'actor/ppo_kl': 0.0004808471421711147}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.7071478366851807, 'actor/pg_clipfrac': 0.00313152396120131, 'actor/ppo_kl': 0.0016447178786620498}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0005831752787344158, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020189224742352962}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2096197009086609, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025146707776002586}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.12727177143096924, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018451742362231016}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.549338698387146, 'actor/pg_clipfrac': 0.006353239994496107, 'actor/ppo_kl': -0.000736377143766731}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.38920801877975464, 'actor/pg_clipfrac': 0.0020140986889600754, 'actor/ppo_kl': -0.0007083831587806344}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00027305507683195174, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013015454169362783}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0003965801151935011, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006586145027540624}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2300471067428589, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007410335238091648}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.047452185302972794, 'actor/pg_clipfrac': 0.0016438355669379234, 'actor/ppo_kl': 0.00011653011461021379}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2821866571903229, 'actor/pg_clipfrac': 0.0015588464448228478, 'actor/ppo_kl': 0.0016310610808432102}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.6077563762664795, 'actor/pg_clipfrac': 0.002344665816053748, 'actor/ppo_kl': 0.0003195938188582659}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0958617702126503, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013197977095842361}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.6184820532798767, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005846476997248828}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.28360477089881897, 'actor/pg_clipfrac': 0.002030456904321909, 'actor/ppo_kl': 0.0005548564367927611}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.21957072615623474, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010604917770251632}
[36m(Runner pid=3309020)[0m Step 58
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.238
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.039
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.019
[36m(Runner pid=3309020)[0m ppo_kl: 3.067049040588188e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.042
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.042
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.652
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.652
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 999554
[36m(Runner pid=3309020)[0m balanced_min: 999045
[36m(Runner pid=3309020)[0m max: 1007385
[36m(Runner pid=3309020)[0m mean: 999299.5
[36m(Runner pid=3309020)[0m min: 991214
[36m(Runner pid=3309020)[0m minmax_diff: 16171
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.008
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:13:39, 3.47s/it, est. speed input: 134.43 toks/s, output: 25.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:18<32:05, 1.52s/it, est. speed input: 253.49 toks/s, output: 49.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<31:13, 1.48s/it, est. speed input: 274.48 toks/s, output: 60.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<24:12, 1.15s/it, est. speed input: 320.37 toks/s, output: 79.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<17:05, 1.22it/s, est. speed input: 387.42 toks/s, output: 101.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<12:05, 1.72it/s, est. speed input: 457.86 toks/s, output: 122.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<09:51, 2.10it/s, est. speed input: 515.31 toks/s, output: 139.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<08:43, 2.37it/s, est. speed input: 560.91 toks/s, output: 153.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<06:49, 3.02it/s, est. speed input: 618.59 toks/s, output: 175.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<05:01, 4.08it/s, est. speed input: 682.52 toks/s, output: 198.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<04:14, 4.82it/s, est. speed input: 736.47 toks/s, output: 219.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<04:01, 5.04it/s, est. speed input: 783.74 toks/s, output: 237.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<03:22, 6.00it/s, est. speed input: 838.42 toks/s, output: 256.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:09, 9.31it/s, est. speed input: 957.30 toks/s, output: 296.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<02:18, 8.69it/s, est. speed input: 1000.33 toks/s, output: 313.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:31, 13.01it/s, est. speed input: 1124.35 toks/s, output: 357.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:18, 15.09it/s, est. speed input: 1179.33 toks/s, output: 377.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<01:04, 18.20it/s, est. speed input: 1293.46 toks/s, output: 419.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:47, 24.57it/s, est. speed input: 1410.13 toks/s, output: 464.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:31, 36.74it/s, est. speed input: 1579.03 toks/s, output: 535.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:36, 31.26it/s, est. speed input: 1679.78 toks/s, output: 578.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:28, 39.82it/s, est. speed input: 1850.12 toks/s, output: 644.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:39, 28.42it/s, est. speed input: 1941.57 toks/s, output: 685.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:31, 34.57it/s, est. speed input: 2103.42 toks/s, output: 761.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:31, 34.93it/s, est. speed input: 2156.24 toks/s, output: 781.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:33, 32.63it/s, est. speed input: 2199.42 toks/s, output: 801.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:35, 30.80it/s, est. speed input: 2246.63 toks/s, output: 821.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:40<00:39, 27.05it/s, est. speed input: 2289.22 toks/s, output: 844.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:31, 33.80it/s, est. speed input: 2389.72 toks/s, output: 897.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:26, 39.39it/s, est. speed input: 2486.35 toks/s, output: 940.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:36, 28.86it/s, est. speed input: 2520.18 toks/s, output: 961.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:27, 38.16it/s, est. speed input: 2624.48 toks/s, output: 1017.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:41<00:26, 39.02it/s, est. speed input: 2667.78 toks/s, output: 1039.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:31, 32.85it/s, est. speed input: 2706.99 toks/s, output: 1063.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:29, 35.00it/s, est. speed input: 2752.99 toks/s, output: 1089.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:33, 30.21it/s, est. speed input: 2789.72 toks/s, output: 1114.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:42<00:49, 20.55it/s, est. speed input: 2816.19 toks/s, output: 1127.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:42<00:32, 30.87it/s, est. speed input: 2911.12 toks/s, output: 1187.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:30, 33.35it/s, est. speed input: 2955.09 toks/s, output: 1211.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:42<00:30, 32.66it/s, est. speed input: 2999.36 toks/s, output: 1231.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:41, 23.84it/s, est. speed input: 3027.07 toks/s, output: 1241.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:43<00:41, 23.84it/s, est. speed input: 3068.08 toks/s, output: 1261.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:43<00:22, 43.53it/s, est. speed input: 3217.35 toks/s, output: 1350.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:43<00:21, 45.04it/s, est. speed input: 3312.27 toks/s, output: 1401.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:43<00:20, 46.22it/s, est. speed input: 3412.28 toks/s, output: 1450.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:44<00:21, 43.94it/s, est. speed input: 3497.00 toks/s, output: 1498.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:44<00:18, 49.30it/s, est. speed input: 3587.96 toks/s, output: 1554.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:44<00:21, 43.27it/s, est. speed input: 3672.68 toks/s, output: 1592.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:44<00:23, 38.56it/s, est. speed input: 3708.92 toks/s, output: 1620.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:44<00:26, 35.04it/s, est. speed input: 3746.33 toks/s, output: 1639.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:45<00:17, 51.58it/s, est. speed input: 3894.92 toks/s, output: 1720.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:45<00:10, 84.50it/s, est. speed input: 4280.48 toks/s, output: 1931.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:45<00:11, 73.74it/s, est. speed input: 4361.47 toks/s, output: 1977.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:45<00:12, 65.95it/s, est. speed input: 4442.28 toks/s, output: 2026.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:45<00:12, 68.78it/s, est. speed input: 4530.19 toks/s, output: 2071.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:46<00:08, 91.81it/s, est. speed input: 4761.27 toks/s, output: 2206.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:46<00:09, 83.71it/s, est. speed input: 4844.89 toks/s, output: 2262.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:46<00:11, 68.44it/s, est. speed input: 4919.06 toks/s, output: 2303.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:46<00:11, 67.03it/s, est. speed input: 4998.28 toks/s, output: 2353.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:46<00:08, 89.50it/s, est. speed input: 5184.42 toks/s, output: 2461.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:47<00:13, 56.81it/s, est. speed input: 5280.14 toks/s, output: 2527.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:47<00:08, 84.04it/s, est. speed input: 5521.66 toks/s, output: 2687.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:47<00:08, 85.97it/s, est. speed input: 5654.60 toks/s, output: 2769.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:47<00:07, 94.24it/s, est. speed input: 5876.91 toks/s, output: 2895.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:06, 99.54it/s, est. speed input: 6009.75 toks/s, output: 2967.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:48<00:06, 93.89it/s, est. speed input: 6135.47 toks/s, output: 3062.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:48<00:08, 75.71it/s, est. speed input: 6239.13 toks/s, output: 3141.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:48<00:09, 65.28it/s, est. speed input: 6305.91 toks/s, output: 3179.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:48<00:09, 65.42it/s, est. speed input: 6412.05 toks/s, output: 3259.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:49<00:10, 59.33it/s, est. speed input: 6479.18 toks/s, output: 3319.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:49<00:08, 71.49it/s, est. speed input: 6607.23 toks/s, output: 3395.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:49<00:06, 84.83it/s, est. speed input: 6773.95 toks/s, output: 3526.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:49<00:04, 112.80it/s, est. speed input: 7112.28 toks/s, output: 3691.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:49<00:04, 121.39it/s, est. speed input: 7359.75 toks/s, output: 3879.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:49<00:03, 121.83it/s, est. speed input: 7476.87 toks/s, output: 3960.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:50<00:04, 109.88it/s, est. speed input: 7597.14 toks/s, output: 4048.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:50<00:03, 137.63it/s, est. speed input: 7811.51 toks/s, output: 4219.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:50<00:03, 134.47it/s, est. speed input: 7972.01 toks/s, output: 4314.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:50<00:03, 101.96it/s, est. speed input: 8108.37 toks/s, output: 4420.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:50<00:04, 93.05it/s, est. speed input: 8214.85 toks/s, output: 4526.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:50<00:02, 129.07it/s, est. speed input: 8482.88 toks/s, output: 4750.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:51<00:02, 122.75it/s, est. speed input: 8629.21 toks/s, output: 4867.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:51<00:02, 117.63it/s, est. speed input: 8745.63 toks/s, output: 4957.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:51<00:02, 114.96it/s, est. speed input: 8853.34 toks/s, output: 5036.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:51<00:01, 157.27it/s, est. speed input: 9140.04 toks/s, output: 5283.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:51<00:01, 165.17it/s, est. speed input: 9336.32 toks/s, output: 5424.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:51<00:01, 136.06it/s, est. speed input: 9471.04 toks/s, output: 5532.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:52<00:01, 117.39it/s, est. speed input: 9620.60 toks/s, output: 5680.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:52<00:02, 86.87it/s, est. speed input: 9689.58 toks/s, output: 5758.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:52<00:02, 81.94it/s, est. speed input: 9779.76 toks/s, output: 5871.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1115/1280 [00:52<00:01, 90.70it/s, est. speed input: 9893.90 toks/s, output: 5970.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:52<00:01, 105.92it/s, est. speed input: 10038.59 toks/s, output: 6112.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:53<00:01, 93.92it/s, est. speed input: 10129.74 toks/s, output: 6218.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:53<00:01, 94.47it/s, est. speed input: 10231.47 toks/s, output: 6332.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:53<00:00, 101.00it/s, est. speed input: 10330.33 toks/s, output: 6472.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:53<00:01, 78.19it/s, est. speed input: 10407.17 toks/s, output: 6569.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:54<00:01, 54.59it/s, est. speed input: 10447.52 toks/s, output: 6635.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:54<00:01, 52.04it/s, est. speed input: 10486.85 toks/s, output: 6684.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:54<00:00, 52.09it/s, est. speed input: 10565.92 toks/s, output: 6815.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:54<00:00, 47.44it/s, est. speed input: 10595.49 toks/s, output: 6871.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:00, 42.91it/s, est. speed input: 10663.08 toks/s, output: 6958.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:55<00:00, 46.15it/s, est. speed input: 10721.45 toks/s, output: 7029.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.50it/s, est. speed input: 10525.96 toks/s, output: 6939.04 toks/s]
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 1142.392
[36m(Runner pid=3309020)[0m time_per_step: 874.743
[36m(Runner pid=3309020)[0m total_num_tokens: 1998599
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 807.0
[36m(Runner pid=3309020)[0m mean: 466.037
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 4127.0
[36m(Runner pid=3309020)[0m mean: 314.666
[36m(Runner pid=3309020)[0m min: 55.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.307
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.652
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.455114758054347e-05
[36m(Runner pid=3309020)[0m gen: 0.159
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.282
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.189
[36m(Runner pid=3309020)[0m gen: 127.964
[36m(Runner pid=3309020)[0m old: 88.46
[36m(Runner pid=3309020)[0m ref: 87.827
[36m(Runner pid=3309020)[0m reward: 6.477
[36m(Runner pid=3309020)[0m step: 874.743
[36m(Runner pid=3309020)[0m update_actor: 562.76
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 59; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.66 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 11:58:58 [executor_base.py:219] It took 0.338636 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.58 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 11:58:58 [executor_base.py:219] It took 0.340941 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:00:26 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:00:26 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.85 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:00:26 [executor_base.py:208] It took 0.325191 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.85 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:00:29 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:00:29 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:00:29 [executor_base.py:208] It took 0.326052 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0005338925402611494, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018173804273828864}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003036411653738469, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0005200927844271064, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00044643014553003013, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0005707116797566414, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.19027145206928253, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00041786619112826884}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.058307915925979614, 'actor/pg_clipfrac': 0.002193944761529565, 'actor/ppo_kl': -8.891810284694657e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0005810889997519553, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0027602468617260456}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.3090845048427582, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.21629290282726288, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.03053983300924301, 'actor/pg_clipfrac': 0.000561797758564353, 'actor/ppo_kl': 0.00032338561140932143}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.1805630475282669, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00038023616070859134, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.3506169021129608, 'actor/pg_clipfrac': 0.0009276437922380865, 'actor/ppo_kl': -0.00018460763385519385}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00023937640071380883, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.6142869591712952, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.14875105023384094, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003963666968047619}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002824912662617862, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006872498779557645}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0004337565042078495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008590146899223328}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.012230937369167805, 'actor/pg_clipfrac': 0.0026631157379597425, 'actor/ppo_kl': -0.0009473639074712992}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.23341982066631317, 'actor/pg_clipfrac': 0.0031347961630672216, 'actor/ppo_kl': -5.303356010699645e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003309190215077251, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010623183334246278}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00044042590889148414, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001372465630993247}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5938219428062439, 'actor/pg_clipfrac': 0.0025510203558951616, 'actor/ppo_kl': -0.0003860888828057796}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.032506994903087616, 'actor/pg_clipfrac': 0.00162601622287184, 'actor/ppo_kl': -0.0010276887333020568}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00029090416501276195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00102906278334558}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00023552391212433577, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00020217022392898798}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.35831862688064575, 'actor/pg_clipfrac': 0.0035211266949772835, 'actor/ppo_kl': 0.0005902790580876172}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.16932056844234467, 'actor/pg_clipfrac': 0.0012453299714252353, 'actor/ppo_kl': 0.0011018940713256598}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.23945870995521545, 'actor/pg_clipfrac': 0.0008312552236020565, 'actor/ppo_kl': -0.0004896558821201324}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2517252266407013, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013844394125044346}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.12133543938398361, 'actor/pg_clipfrac': 0.0012562813935801387, 'actor/ppo_kl': -0.00083345384337008}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.01547827385365963, 'actor/pg_clipfrac': 0.0017953321803361177, 'actor/ppo_kl': 0.00046283850679174066}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.23113708198070526, 'actor/pg_clipfrac': 0.0005892752087675035, 'actor/ppo_kl': 0.0007995544583536685}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.1319480985403061, 'actor/pg_clipfrac': 0.0007342143799178302, 'actor/ppo_kl': -0.00023158741532824934}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.20561645925045013, 'actor/pg_clipfrac': 0.0007012622663751245, 'actor/ppo_kl': -0.0012437625555321574}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7231798768043518, 'actor/pg_clipfrac': 0.002074688905850053, 'actor/ppo_kl': 0.00013029921683482826}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00029142049606889486, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008928341558203101}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.09589444100856781, 'actor/pg_clipfrac': 0.0015015015378594398, 'actor/ppo_kl': 0.00037748320028185844}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3544237017631531, 'actor/pg_clipfrac': 0.0024958401918411255, 'actor/ppo_kl': -0.00036436389200389385}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.11515823751688004, 'actor/pg_clipfrac': 0.0023980815894901752, 'actor/ppo_kl': 0.002924871165305376}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4019589424133301, 'actor/pg_clipfrac': 0.0027149321977049112, 'actor/ppo_kl': -0.0007994172628968954}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.6087930202484131, 'actor/pg_clipfrac': 0.0008025682182051241, 'actor/ppo_kl': 0.0002737014729063958}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00034630022128112614, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002364847023272887}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.16018588840961456, 'actor/pg_clipfrac': 0.0011185682378709316, 'actor/ppo_kl': -0.0003659015928860754}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.39482736587524414, 'actor/pg_clipfrac': 0.0007968127611093223, 'actor/ppo_kl': 0.00011656103743007407}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2824491262435913, 'actor/pg_clipfrac': 0.0010282776784151793, 'actor/ppo_kl': -0.0008160740835592151}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3854185938835144, 'actor/pg_clipfrac': 0.0004906771355308592, 'actor/ppo_kl': 0.0001995930797420442}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.424238383769989, 'actor/pg_clipfrac': 0.0010219723917543888, 'actor/ppo_kl': -0.00041245121974498034}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.21862483024597168, 'actor/pg_clipfrac': 0.0015197568573057652, 'actor/ppo_kl': 0.0004319332947488874}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.340057909488678, 'actor/pg_clipfrac': 0.0013837638543918729, 'actor/ppo_kl': 0.0002798919740598649}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.1282138228416443, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008913243073038757}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.8891749978065491, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.859163942048326e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003951415710616857, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001599490875378251}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0004041238280478865, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007774583646096289}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.3264767825603485, 'actor/pg_clipfrac': 0.0006825938471592963, 'actor/ppo_kl': 8.159155549947172e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.16536159813404083, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005630773957818747}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.5801229476928711, 'actor/pg_clipfrac': 0.0009009009227156639, 'actor/ppo_kl': -0.0010638039093464613}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.5084941983222961, 'actor/pg_clipfrac': 0.002659574383869767, 'actor/ppo_kl': 0.0008447795989923179}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4441571533679962, 'actor/pg_clipfrac': 0.001401541638188064, 'actor/ppo_kl': -0.00081933120964095}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.04013831540942192, 'actor/pg_clipfrac': 0.0007651109481230378, 'actor/ppo_kl': -0.000441729964222759}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.1974063366651535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00062569446163252}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2056698501110077, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025901285698637366}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0004417157906573266, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00035067362478002906}
[36m(Runner pid=3309020)[0m Step 59
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.251
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.021
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.001
[36m(Runner pid=3309020)[0m ppo_kl: -2.7184350311060256e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:22:02, 3.86s/it, est. speed input: 118.38 toks/s, output: 29.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<50:02, 2.36s/it, est. speed input: 177.69 toks/s, output: 47.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:30<35:44, 1.70s/it, est. speed input: 227.12 toks/s, output: 63.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<23:43, 1.13s/it, est. speed input: 289.27 toks/s, output: 84.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<15:29, 1.35it/s, est. speed input: 358.46 toks/s, output: 109.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<12:24, 1.68it/s, est. speed input: 410.03 toks/s, output: 126.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<06:33, 3.15it/s, est. speed input: 546.73 toks/s, output: 176.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:34<04:23, 4.67it/s, est. speed input: 667.71 toks/s, output: 219.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<03:44, 5.47it/s, est. speed input: 724.92 toks/s, output: 242.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<03:11, 6.36it/s, est. speed input: 783.67 toks/s, output: 265.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:35<02:50, 7.14it/s, est. speed input: 837.75 toks/s, output: 283.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:35<02:18, 8.74it/s, est. speed input: 896.97 toks/s, output: 297.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<01:29, 13.45it/s, est. speed input: 1020.96 toks/s, output: 345.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:36<01:37, 12.26it/s, est. speed input: 1066.20 toks/s, output: 363.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:35, 12.35it/s, est. speed input: 1162.99 toks/s, output: 401.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:25, 13.82it/s, est. speed input: 1215.65 toks/s, output: 423.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:51, 22.84it/s, est. speed input: 1393.83 toks/s, output: 492.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:47, 24.55it/s, est. speed input: 1446.80 toks/s, output: 511.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:32, 34.94it/s, est. speed input: 1617.98 toks/s, output: 586.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:36, 31.16it/s, est. speed input: 1715.15 toks/s, output: 629.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:34, 32.79it/s, est. speed input: 1820.44 toks/s, output: 676.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:32, 34.09it/s, est. speed input: 1919.89 toks/s, output: 727.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:39<00:27, 40.44it/s, est. speed input: 2082.74 toks/s, output: 799.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:33, 33.04it/s, est. speed input: 2123.71 toks/s, output: 822.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:26, 40.65it/s, est. speed input: 2233.00 toks/s, output: 867.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:30, 35.71it/s, est. speed input: 2326.10 toks/s, output: 903.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:42, 25.18it/s, est. speed input: 2356.95 toks/s, output: 917.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:38, 27.53it/s, est. speed input: 2402.43 toks/s, output: 943.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:40, 26.06it/s, est. speed input: 2447.11 toks/s, output: 959.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:39, 26.81it/s, est. speed input: 2488.69 toks/s, output: 987.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:38, 27.47it/s, est. speed input: 2531.65 toks/s, output: 1010.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:34, 30.66it/s, est. speed input: 2575.54 toks/s, output: 1034.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:41<00:23, 43.48it/s, est. speed input: 2727.86 toks/s, output: 1103.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:17, 56.72it/s, est. speed input: 2883.61 toks/s, output: 1188.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:42<00:15, 64.09it/s, est. speed input: 2988.05 toks/s, output: 1239.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:42<00:17, 56.96it/s, est. speed input: 3129.33 toks/s, output: 1319.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:42<00:27, 35.94it/s, est. speed input: 3195.54 toks/s, output: 1364.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:22, 43.70it/s, est. speed input: 3292.59 toks/s, output: 1417.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:43<00:23, 40.49it/s, est. speed input: 3380.55 toks/s, output: 1464.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:43<00:22, 43.16it/s, est. speed input: 3473.61 toks/s, output: 1507.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:43<00:15, 58.91it/s, est. speed input: 3700.15 toks/s, output: 1625.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:14, 62.38it/s, est. speed input: 3792.44 toks/s, output: 1677.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:43<00:13, 68.92it/s, est. speed input: 3945.90 toks/s, output: 1770.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:44<00:12, 69.07it/s, est. speed input: 4086.20 toks/s, output: 1857.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:44<00:09, 91.94it/s, est. speed input: 4370.35 toks/s, output: 2012.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:44<00:11, 72.31it/s, est. speed input: 4495.24 toks/s, output: 2085.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:09, 83.40it/s, est. speed input: 4687.96 toks/s, output: 2207.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:45<00:08, 91.12it/s, est. speed input: 4833.86 toks/s, output: 2281.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:45<00:07, 97.77it/s, est. speed input: 5125.54 toks/s, output: 2446.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:45<00:06, 111.08it/s, est. speed input: 5317.54 toks/s, output: 2565.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:07, 95.36it/s, est. speed input: 5441.83 toks/s, output: 2655.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:45<00:06, 112.79it/s, est. speed input: 5634.95 toks/s, output: 2754.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:45<00:06, 105.38it/s, est. speed input: 5771.85 toks/s, output: 2832.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:46<00:06, 106.67it/s, est. speed input: 5904.48 toks/s, output: 2900.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:07, 86.74it/s, est. speed input: 6015.74 toks/s, output: 2976.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:46<00:07, 84.84it/s, est. speed input: 6146.03 toks/s, output: 3052.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:46<00:08, 79.86it/s, est. speed input: 6220.89 toks/s, output: 3104.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:46<00:07, 90.05it/s, est. speed input: 6353.62 toks/s, output: 3187.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:46<00:06, 98.69it/s, est. speed input: 6482.10 toks/s, output: 3274.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:46<00:05, 106.09it/s, est. speed input: 6611.26 toks/s, output: 3355.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:47<00:04, 123.56it/s, est. speed input: 6791.12 toks/s, output: 3470.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:47<00:04, 125.60it/s, est. speed input: 6927.02 toks/s, output: 3546.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:47<00:06, 88.77it/s, est. speed input: 7030.45 toks/s, output: 3615.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:47<00:05, 106.15it/s, est. speed input: 7211.89 toks/s, output: 3743.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:47<00:04, 110.76it/s, est. speed input: 7381.44 toks/s, output: 3850.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:48<00:05, 96.49it/s, est. speed input: 7490.81 toks/s, output: 3932.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:05, 85.67it/s, est. speed input: 7596.63 toks/s, output: 4011.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:05, 92.64it/s, est. speed input: 7726.26 toks/s, output: 4098.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:48<00:05, 90.88it/s, est. speed input: 7836.88 toks/s, output: 4184.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:48<00:04, 98.12it/s, est. speed input: 7964.28 toks/s, output: 4283.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:48<00:03, 119.86it/s, est. speed input: 8133.21 toks/s, output: 4397.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 885/1280 [00:49<00:03, 122.73it/s, est. speed input: 8392.58 toks/s, output: 4579.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:49<00:03, 114.76it/s, est. speed input: 8504.71 toks/s, output: 4676.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:49<00:03, 109.84it/s, est. speed input: 8624.57 toks/s, output: 4786.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:49<00:02, 132.47it/s, est. speed input: 8873.08 toks/s, output: 4978.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:49<00:02, 121.16it/s, est. speed input: 8979.52 toks/s, output: 5075.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:49<00:02, 114.11it/s, est. speed input: 9089.00 toks/s, output: 5150.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:50<00:02, 97.92it/s, est. speed input: 9185.57 toks/s, output: 5238.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:50<00:02, 101.52it/s, est. speed input: 9379.50 toks/s, output: 5396.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:50<00:02, 106.37it/s, est. speed input: 9496.45 toks/s, output: 5489.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:50<00:02, 91.52it/s, est. speed input: 9587.96 toks/s, output: 5571.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:50<00:02, 88.16it/s, est. speed input: 9651.29 toks/s, output: 5623.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:51<00:03, 67.27it/s, est. speed input: 9720.18 toks/s, output: 5676.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:51<00:02, 71.61it/s, est. speed input: 9786.33 toks/s, output: 5724.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:51<00:02, 83.83it/s, est. speed input: 9897.06 toks/s, output: 5864.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:51<00:01, 93.76it/s, est. speed input: 10004.33 toks/s, output: 5969.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:51<00:01, 88.71it/s, est. speed input: 10213.54 toks/s, output: 6174.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:51<00:01, 87.71it/s, est. speed input: 10314.15 toks/s, output: 6281.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:52<00:01, 81.83it/s, est. speed input: 10368.15 toks/s, output: 6362.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:52<00:01, 75.43it/s, est. speed input: 10422.65 toks/s, output: 6427.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:01, 68.86it/s, est. speed input: 10476.90 toks/s, output: 6499.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:52<00:01, 66.56it/s, est. speed input: 10532.12 toks/s, output: 6571.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:52<00:01, 62.10it/s, est. speed input: 10583.25 toks/s, output: 6634.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:53<00:01, 42.12it/s, est. speed input: 10589.33 toks/s, output: 6689.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:53<00:01, 43.29it/s, est. speed input: 10636.31 toks/s, output: 6777.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:53<00:00, 60.48it/s, est. speed input: 10772.00 toks/s, output: 6956.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:54<00:00, 44.66it/s, est. speed input: 10774.94 toks/s, output: 6998.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 49.41it/s, est. speed input: 10835.12 toks/s, output: 7082.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:54<00:00, 38.47it/s, est. speed input: 10835.43 toks/s, output: 7116.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 33.37it/s, est. speed input: 10824.28 toks/s, output: 7133.92 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:54<00:00, 23.34it/s, est. speed input: 10824.28 toks/s, output: 7133.92 toks/s]
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.006
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.006
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.673
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.673
[36m(Runner pid=3309020)[0m min: 0.15
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 988673
[36m(Runner pid=3309020)[0m balanced_min: 988673
[36m(Runner pid=3309020)[0m max: 993039
[36m(Runner pid=3309020)[0m mean: 988673.0
[36m(Runner pid=3309020)[0m min: 984307
[36m(Runner pid=3309020)[0m minmax_diff: 8732
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.108
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1164.066
[36m(Runner pid=3309020)[0m time_per_step: 849.327
[36m(Runner pid=3309020)[0m total_num_tokens: 1977346
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 464.867
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1479.0
[36m(Runner pid=3309020)[0m mean: 307.534
[36m(Runner pid=3309020)[0m min: 57.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.346
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.673
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 6.276964422019155e-05
[36m(Runner pid=3309020)[0m gen: 0.133
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.007
[36m(Runner pid=3309020)[0m update_actor: 0.285
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.124
[36m(Runner pid=3309020)[0m gen: 105.043
[36m(Runner pid=3309020)[0m old: 86.636
[36m(Runner pid=3309020)[0m ref: 88.044
[36m(Runner pid=3309020)[0m reward: 5.797
[36m(Runner pid=3309020)[0m step: 849.327
[36m(Runner pid=3309020)[0m update_actor: 563.068
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 60; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:13:08 [executor_base.py:219] It took 0.340488 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:14:34 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:13:08 [executor_base.py:219] It took 0.342939 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:14:34 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:14:34 [executor_base.py:208] It took 0.325751 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:14:52 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:14:53 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:14:53 [executor_base.py:208] It took 0.328065 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.2868851125240326, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.22735878825187683, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002520764246582985, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006221414660103619}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00023005651019047946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0004857983731199056, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003723361296579242, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.20608773827552795, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010085467947646976}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00042196534923277795, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013258365215733647}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0005208140355534852, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.42942553758621216, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.0765707865357399, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0004585394053719938}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.39266541600227356, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00019983875972684473}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.27868011593818665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3293007016181946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.042808640748262405, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010724172898335382}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.11192701756954193, 'actor/pg_clipfrac': 0.0013054830487817526, 'actor/ppo_kl': 0.001570536638610065}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.12469762563705444, 'actor/pg_clipfrac': 0.0007401924231089652, 'actor/ppo_kl': -0.00021815211221110076}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002811513841152191, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008024307317100465}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002736126189120114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000753741420339793}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.1615963876247406, 'actor/pg_clipfrac': 0.0009666505502536893, 'actor/ppo_kl': 5.546890861296561e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00041901535587385297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017412352608516812}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.19410990178585052, 'actor/pg_clipfrac': 0.0016806722851470113, 'actor/ppo_kl': -0.0007736285915598273}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0003461672749835998, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004620287800207734}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.09450860321521759, 'actor/pg_clipfrac': 0.0019047618843615055, 'actor/ppo_kl': 0.0006512778345495462}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.26971548795700073, 'actor/pg_clipfrac': 0.0017867778660729527, 'actor/ppo_kl': 0.000300795363727957}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002311710559297353, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0019005457870662212}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.12297426164150238, 'actor/pg_clipfrac': 0.002597402548417449, 'actor/ppo_kl': -0.00041535942000336945}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5072246193885803, 'actor/pg_clipfrac': 0.0026648901402950287, 'actor/ppo_kl': 0.002082376042380929}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0004199187096673995, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003518460434861481}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003055767447222024, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005860520177520812}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.05967486649751663, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.486948743462563e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.35859182476997375, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006379394908435643}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5557510852813721, 'actor/pg_clipfrac': 0.002739726100116968, 'actor/ppo_kl': 0.0014182522427290678}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.35281991958618164, 'actor/pg_clipfrac': 0.0010537407360970974, 'actor/ppo_kl': 0.000342818268109113}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2154960185289383, 'actor/pg_clipfrac': 0.0008051529875956476, 'actor/ppo_kl': 0.0014111631317064166}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002952521026600152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005733480211347342}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.294552743434906, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015695590991526842}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3494543433189392, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006415346288122237}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.5419065356254578, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -4.700588033301756e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.5464774370193481, 'actor/pg_clipfrac': 0.0013755158288404346, 'actor/ppo_kl': -0.0012247093254700303}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.45040199160575867, 'actor/pg_clipfrac': 0.001291155582293868, 'actor/ppo_kl': -0.0008320066262967885}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00021020998246967793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001771750394254923}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.3898378312587738, 'actor/pg_clipfrac': 0.002148997038602829, 'actor/ppo_kl': -0.0020677668508142233}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00048400191008113325, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002396960335317999}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.27628839015960693, 'actor/pg_clipfrac': 0.000654450268484652, 'actor/ppo_kl': -0.0006173618021421134}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0004241579445078969, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00044084430555813015}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00030991953099146485, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008554175728932023}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1227853000164032, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012665953254327178}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.031133878976106644, 'actor/pg_clipfrac': 0.002773925196379423, 'actor/ppo_kl': 0.0015714118489995599}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.031218498945236206, 'actor/pg_clipfrac': 0.0019102196674793959, 'actor/ppo_kl': -0.0010893420549109578}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.9408025145530701, 'actor/pg_clipfrac': 0.0024479804560542107, 'actor/ppo_kl': -0.0008638071594759822}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.15481169521808624, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005668654921464622}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.223812535405159, 'actor/pg_clipfrac': 0.0011737089371308684, 'actor/ppo_kl': 0.0005808715941384435}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.32934996485710144, 'actor/pg_clipfrac': 0.0012695725308731198, 'actor/ppo_kl': -0.0011850511655211449}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3693113327026367, 'actor/pg_clipfrac': 0.0009389671613462269, 'actor/ppo_kl': 0.0013035607989877462}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.000308155344100669, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009107840596698225}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00032678377465344965, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009304388659074903}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4015825688838959, 'actor/pg_clipfrac': 0.00136239780113101, 'actor/ppo_kl': 0.0010157242650166154}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0002667660592123866, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019505200907588005}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1313910037279129, 'actor/pg_clipfrac': 0.0007022471982054412, 'actor/ppo_kl': 0.0011245427886024117}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.5050954222679138, 'actor/pg_clipfrac': 0.0027027027681469917, 'actor/ppo_kl': -0.0004022117063868791}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:36:39, 15.42s/it, est. speed input: 29.44 toks/s, output: 6.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<41:01, 6.56s/it, est. speed input: 60.12 toks/s, output: 12.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:15<22:42, 3.64s/it, est. speed input: 89.22 toks/s, output: 18.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%|▏ | 5/377 [00:16<10:01, 1.62s/it, est. speed input: 146.63 toks/s, output: 31.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:16<07:24, 1.20s/it, est. speed input: 173.71 toks/s, output: 38.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 10/377 [00:16<02:53, 2.11it/s, est. speed input: 283.36 toks/s, output: 66.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 13/377 [00:16<01:54, 3.18it/s, est. speed input: 360.46 toks/s, output: 87.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 17/377 [00:16<01:09, 5.18it/s, est. speed input: 467.13 toks/s, output: 117.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 20/377 [00:16<00:53, 6.70it/s, est. speed input: 543.92 toks/s, output: 139.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:17<00:31, 11.30it/s, est. speed input: 703.56 toks/s, output: 185.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 31/377 [00:17<00:23, 14.70it/s, est. speed input: 829.26 toks/s, output: 223.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 35/377 [00:17<00:20, 16.73it/s, est. speed input: 927.08 toks/s, output: 254.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 39/377 [00:17<00:17, 19.45it/s, est. speed input: 1028.74 toks/s, output: 286.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 44/377 [00:17<00:14, 23.57it/s, est. speed input: 1153.09 toks/s, output: 328.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 49/377 [00:17<00:12, 27.26it/s, est. speed input: 1275.70 toks/s, output: 368.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 53/377 [00:17<00:14, 23.03it/s, est. speed input: 1362.57 toks/s, output: 399.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 57/377 [00:18<00:12, 25.12it/s, est. speed input: 1456.48 toks/s, output: 434.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 61/377 [00:18<00:11, 26.91it/s, est. speed input: 1550.74 toks/s, output: 469.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 66/377 [00:18<00:10, 30.46it/s, est. speed input: 1665.30 toks/s, output: 514.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 76/377 [00:18<00:06, 44.17it/s, est. speed input: 1906.81 toks/s, output: 608.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 82/377 [00:18<00:06, 45.88it/s, est. speed input: 2045.71 toks/s, output: 663.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 87/377 [00:18<00:06, 45.04it/s, est. speed input: 2154.68 toks/s, output: 709.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 95/377 [00:18<00:05, 51.74it/s, est. speed input: 2344.24 toks/s, output: 785.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 101/377 [00:18<00:06, 45.50it/s, est. speed input: 2469.96 toks/s, output: 841.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 107/377 [00:19<00:05, 47.44it/s, est. speed input: 2598.91 toks/s, output: 898.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 119/377 [00:19<00:04, 63.31it/s, est. speed input: 2879.97 toks/s, output: 1022.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 126/377 [00:19<00:04, 57.98it/s, est. speed input: 3034.22 toks/s, output: 1089.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 136/377 [00:19<00:03, 61.91it/s, est. speed input: 3251.02 toks/s, output: 1192.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 143/377 [00:19<00:04, 58.21it/s, est. speed input: 3397.61 toks/s, output: 1263.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 150/377 [00:19<00:03, 58.68it/s, est. speed input: 3544.67 toks/s, output: 1336.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 156/377 [00:19<00:04, 53.84it/s, est. speed input: 3661.88 toks/s, output: 1397.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:20<00:03, 55.96it/s, est. speed input: 3805.71 toks/s, output: 1472.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 169/377 [00:20<00:03, 55.38it/s, est. speed input: 3922.96 toks/s, output: 1536.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 178/377 [00:20<00:03, 63.46it/s, est. speed input: 4109.48 toks/s, output: 1639.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 186/377 [00:20<00:02, 66.95it/s, est. speed input: 4273.05 toks/s, output: 1731.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 199/377 [00:20<00:02, 83.02it/s, est. speed input: 4552.97 toks/s, output: 1885.53 toks/s]
Processed prompts: 56%|█████▌ | 211/377 [00:20<00:01, 92.96it/s, est. speed input: 4808.20 toks/s, output: 2029.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:20<00:01, 78.06it/s, est. speed input: 4998.09 toks/s, output: 2142.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 230/377 [00:20<00:02, 59.36it/s, est. speed input: 5143.68 toks/s, output: 2238.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 238/377 [00:21<00:02, 61.88it/s, est. speed input: 5303.99 toks/s, output: 2337.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 245/377 [00:21<00:02, 55.62it/s, est. speed input: 5420.71 toks/s, output: 2417.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 252/377 [00:21<00:02, 53.97it/s, est. speed input: 5544.61 toks/s, output: 2503.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 258/377 [00:21<00:02, 53.95it/s, est. speed input: 5650.99 toks/s, output: 2578.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 265/377 [00:21<00:01, 56.63it/s, est. speed input: 5772.57 toks/s, output: 2669.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 271/377 [00:21<00:01, 55.39it/s, est. speed input: 5873.55 toks/s, output: 2745.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 282/377 [00:21<00:01, 61.52it/s, est. speed input: 6075.09 toks/s, output: 2895.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 290/377 [00:21<00:01, 64.88it/s, est. speed input: 6220.44 toks/s, output: 3007.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 297/377 [00:22<00:01, 63.56it/s, est. speed input: 6339.65 toks/s, output: 3105.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 305/377 [00:22<00:01, 67.16it/s, est. speed input: 6478.71 toks/s, output: 3223.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 312/377 [00:22<00:01, 59.89it/s, est. speed input: 6584.01 toks/s, output: 3320.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 321/377 [00:22<00:00, 60.95it/s, est. speed input: 6735.39 toks/s, output: 3453.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 328/377 [00:22<00:00, 53.37it/s, est. speed input: 6831.41 toks/s, output: 3549.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 334/377 [00:22<00:00, 50.68it/s, est. speed input: 6914.19 toks/s, output: 3638.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 340/377 [00:23<00:00, 41.66it/s, est. speed input: 6975.85 toks/s, output: 3718.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:23<00:01, 31.63it/s, est. speed input: 6992.90 toks/s, output: 3773.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 349/377 [00:23<00:01, 27.56it/s, est. speed input: 7010.12 toks/s, output: 3822.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▎| 353/377 [00:23<00:01, 22.28it/s, est. speed input: 7007.77 toks/s, output: 3861.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 356/377 [00:24<00:01, 20.16it/s, est. speed input: 7008.87 toks/s, output: 3895.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 359/377 [00:24<00:01, 15.93it/s, est. speed input: 6972.87 toks/s, output: 3914.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:24<00:00, 15.83it/s, est. speed input: 6977.63 toks/s, output: 3956.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 364/377 [00:25<00:01, 8.69it/s, est. speed input: 6828.93 toks/s, output: 3901.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 366/377 [00:25<00:01, 8.00it/s, est. speed input: 6777.58 toks/s, output: 3907.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:25<00:01, 6.94it/s, est. speed input: 6706.41 toks/s, output: 3902.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:26<00:00, 7.83it/s, est. speed input: 6703.82 toks/s, output: 3939.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:39<00:00, 7.83it/s, est. speed input: 6703.82 toks/s, output: 3939.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [01:05<00:36, 6.07s/it, est. speed input: 2673.00 toks/s, output: 1650.19 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:05<00:00, 5.74it/s, est. speed input: 2713.73 toks/s, output: 2151.50 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.24671196937561035, 'actor/pg_clipfrac': 0.003241491038352251, 'actor/ppo_kl': -0.0003020767180714756}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.32224568724632263, 'actor/pg_clipfrac': 0.0018018018454313278, 'actor/ppo_kl': -0.00014390228898264468}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.3260009288787842, 'actor/pg_clipfrac': 0.0012210012646391988, 'actor/ppo_kl': -8.325029193656519e-05}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:28:00 [executor_base.py:219] It took 0.340128 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:28:00 [executor_base.py:219] It took 0.340517 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:29:50 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:29:50 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:29:50 [executor_base.py:208] It took 0.326746 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:29:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:29:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:29:56 [executor_base.py:208] It took 0.329213 seconds to fall asleep.
[36m(Runner pid=3309020)[0m validation generation end
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nTo find the minimum perimeter of triangle AEC, we need to understand the geometric properties and relationships in the given figure. Since D is the midpoint of AB, AD = DB = 2.0. The line l is the vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC.\n\nGiven that the perimeter of triangle ABC is 16.0, we can express this as:\n\\[ AB + BC + CA = 16.0 \\]\n\nSince D is the midpoint of AB, AD = DB = 2.0. Therefore, the length of AC is equal to the length of BC because D is the midpoint and the triangle's sides are symmetric about AB. This means that AC = BC.\n\nNow, to minimize the perimeter of triangle AEC, we need to consider that the shortest path between two points is a straight line. Therefore, the minimum perimeter of triangle AEC will be achieved when E is at point A or point C, because then AE = 0 and EC = AC or AE = AC and EC = 0, respectively. However, since E is on the line l and D is the midpoint of AB, the minimum perimeter occurs when E is at point D.\n\nThus, the minimum perimeter of triangle AEC is the sum of the lengths of AD and DC, which is 2.0 + 2.0 = 4.0.\n\nTherefore, the minimum perimeter of triangle AEC is 4.0.\n\n\n4.0\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90.0 degrees to form triangle AB'C', we know that angle BAC is 90.0 degrees. Since the rotation is 90.0 degrees, angle B'AC' is also 90.0 degrees. We are given that angle CC'B' is 32.0 degrees. To find the size of angle AC'B', we can use the fact that the sum of angles in a triangle is 180.0 degrees.\n\nIn triangle AC'B', we have:\n- Angle BAC' = 90.0 degrees (since angle BAC is rotated to angle B'AC')\n- Angle CC'B' = 32.0 degrees\n- Angle AC'B' = 180.0 - 90.0 - 32.0\n\nLet's calculate the size of angle AC'B':\nAngle AC'B' = 180.0 - 90.0 - 32.0 = 58.0 degrees.\n\nTherefore, the size of angle AC'B' is 58.0 degrees.\n\n\n58.0\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are points on circle O, and C is on the diameter BE, we can deduce that angle ADC is an inscribed angle that subtends the arc AD. Since angle ADC = 54°, the central angle subtended by the same arc, angle AOB, is twice the inscribed angle, which is 108°. Because BE is the diameter, angle AEB is a right angle (90°), and thus angle AEB can be found by subtracting angle AOB from 180° (since the sum of angles in a triangle is 180°). Therefore, angle AEB = 180° - 108° = 72°.\n\n\nThe degree of angle AEB is 72°.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AB is the diameter of circle O, and CD is tangent to circle O at point D, we can use the properties of tangents and circles to find the measure of angle C. Since AB is the diameter, angle ADB is a right angle (90 degrees) because any angle subtended by a diameter in a semicircle is a right angle. Given that angle A is 35 degrees, we can find angle DBC by subtracting angle A from 90 degrees, which gives us angle DBC = 90 - 35 = 55 degrees. Since CD is tangent to the circle, angle ODC is also 90 degrees. Therefore, angle C can be found by subtracting angle DBC from 90 degrees. Thus, angle C = 90 - 55 = 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ACD is a right angle (90 degrees) because any angle subtended by a diameter in a semicircle is a right angle. Given that angle EAC = 120 degrees, we can find angle CAD by subtracting the given angle from 90 degrees, since angle CAD = 90 degrees - 120 degrees = -30 degrees. However, since angles are typically measured in positive directions, we take the absolute value and consider it as 30 degrees. Now, angle ABC is an inscribed angle that subtends the same arc as angle ADC, which is the same as angle CAD because they are subtended by the same arc AD. Therefore, angle ABC = angle CAD = 30 degrees.\n\n\nThe degree of angle ABC is 30 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_45
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 60
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.251
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.039
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.018
[36m(Runner pid=3309020)[0m ppo_kl: 3.609788204244069e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.025
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.025
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.668
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.668
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 995247
[36m(Runner pid=3309020)[0m balanced_min: 993689
[36m(Runner pid=3309020)[0m max: 1003998
[36m(Runner pid=3309020)[0m mean: 994468.0
[36m(Runner pid=3309020)[0m min: 984938
[36m(Runner pid=3309020)[0m minmax_diff: 19060
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.23
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 937.127
[36m(Runner pid=3309020)[0m time_per_step: 1061.188
[36m(Runner pid=3309020)[0m total_num_tokens: 1988936
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 671.0
[36m(Runner pid=3309020)[0m mean: 465.242
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3282.0
[36m(Runner pid=3309020)[0m mean: 311.686
[36m(Runner pid=3309020)[0m min: 61.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.339
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.668
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.507583551684405e-05
[36m(Runner pid=3309020)[0m gen: 0.148
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.043
[36m(Runner pid=3309020)[0m reward: 0.007
[36m(Runner pid=3309020)[0m update_actor: 0.283
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.149
[36m(Runner pid=3309020)[0m gen: 118.261
[36m(Runner pid=3309020)[0m old: 85.775
[36m(Runner pid=3309020)[0m ref: 86.27
[36m(Runner pid=3309020)[0m reward: 5.923
[36m(Runner pid=3309020)[0m save_checkpoint: 31.798
[36m(Runner pid=3309020)[0m step: 1061.188
[36m(Runner pid=3309020)[0m update_actor: 563.641
[36m(Runner pid=3309020)[0m validation: 168.763
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.411
[36m(Runner pid=3309020)[0m format_reward: 0.975
[36m(Runner pid=3309020)[0m overall_reward: 0.694
[36m(Runner pid=3309020)[0m reward_score: 0.694
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.981
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_60/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m Training Episode 4.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:30:09, 4.24s/it, est. speed input: 102.52 toks/s, output: 26.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<50:10, 2.37s/it, est. speed input: 169.90 toks/s, output: 45.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<31:40, 1.50s/it, est. speed input: 238.19 toks/s, output: 68.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<19:27, 1.08it/s, est. speed input: 310.85 toks/s, output: 94.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<16:36, 1.26it/s, est. speed input: 354.62 toks/s, output: 108.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:32<11:17, 1.85it/s, est. speed input: 424.30 toks/s, output: 130.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<09:40, 2.14it/s, est. speed input: 471.03 toks/s, output: 145.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<06:52, 3.01it/s, est. speed input: 532.84 toks/s, output: 167.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<05:10, 3.96it/s, est. speed input: 637.57 toks/s, output: 203.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<04:15, 4.79it/s, est. speed input: 694.35 toks/s, output: 223.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:37<04:19, 4.71it/s, est. speed input: 731.50 toks/s, output: 244.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:37<03:17, 6.16it/s, est. speed input: 791.44 toks/s, output: 269.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<01:57, 10.24it/s, est. speed input: 905.71 toks/s, output: 311.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<00:51, 22.95it/s, est. speed input: 1212.80 toks/s, output: 428.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:56, 20.87it/s, est. speed input: 1313.98 toks/s, output: 463.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:38<00:57, 20.09it/s, est. speed input: 1364.26 toks/s, output: 479.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:53, 21.67it/s, est. speed input: 1419.63 toks/s, output: 498.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:40, 28.36it/s, est. speed input: 1533.66 toks/s, output: 544.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:38, 29.49it/s, est. speed input: 1585.18 toks/s, output: 570.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:39<00:44, 25.71it/s, est. speed input: 1632.60 toks/s, output: 592.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<00:41, 27.48it/s, est. speed input: 1685.63 toks/s, output: 611.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:28, 39.87it/s, est. speed input: 1850.82 toks/s, output: 682.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:28, 39.02it/s, est. speed input: 1904.10 toks/s, output: 708.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:40<00:31, 34.72it/s, est. speed input: 2004.72 toks/s, output: 749.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:55, 19.88it/s, est. speed input: 2028.15 toks/s, output: 760.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:40<00:49, 22.24it/s, est. speed input: 2078.06 toks/s, output: 784.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:41<00:38, 28.46it/s, est. speed input: 2184.99 toks/s, output: 828.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:41<00:29, 36.87it/s, est. speed input: 2338.80 toks/s, output: 902.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:41<00:35, 29.93it/s, est. speed input: 2375.06 toks/s, output: 922.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:27, 38.21it/s, est. speed input: 2477.28 toks/s, output: 977.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:41<00:22, 46.42it/s, est. speed input: 2584.07 toks/s, output: 1031.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:42<00:14, 71.25it/s, est. speed input: 2796.66 toks/s, output: 1131.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:42<00:24, 41.79it/s, est. speed input: 2860.28 toks/s, output: 1178.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:42<00:17, 57.07it/s, est. speed input: 3064.03 toks/s, output: 1284.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:43<00:16, 59.31it/s, est. speed input: 3317.31 toks/s, output: 1418.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:43<00:18, 52.15it/s, est. speed input: 3403.95 toks/s, output: 1462.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:43<00:24, 38.52it/s, est. speed input: 3469.03 toks/s, output: 1496.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:44<00:20, 44.84it/s, est. speed input: 3565.38 toks/s, output: 1549.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:44<00:17, 51.83it/s, est. speed input: 3662.20 toks/s, output: 1594.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:44<00:14, 61.81it/s, est. speed input: 3809.35 toks/s, output: 1663.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:44<00:15, 57.76it/s, est. speed input: 3889.43 toks/s, output: 1703.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:45<00:25, 34.49it/s, est. speed input: 3939.28 toks/s, output: 1735.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:45<00:25, 34.17it/s, est. speed input: 4013.29 toks/s, output: 1788.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:45<00:17, 50.88it/s, est. speed input: 4204.36 toks/s, output: 1885.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:45<00:18, 45.68it/s, est. speed input: 4283.24 toks/s, output: 1931.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:15, 53.49it/s, est. speed input: 4424.37 toks/s, output: 1988.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:46<00:17, 48.71it/s, est. speed input: 4504.42 toks/s, output: 2038.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:46<00:14, 57.54it/s, est. speed input: 4670.35 toks/s, output: 2136.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:46<00:15, 52.06it/s, est. speed input: 4740.27 toks/s, output: 2167.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:47<00:13, 57.30it/s, est. speed input: 4870.93 toks/s, output: 2244.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:47<00:16, 47.72it/s, est. speed input: 4938.78 toks/s, output: 2287.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:47<00:18, 42.20it/s, est. speed input: 5005.54 toks/s, output: 2320.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:47<00:17, 44.19it/s, est. speed input: 5083.10 toks/s, output: 2378.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:47<00:16, 44.31it/s, est. speed input: 5118.54 toks/s, output: 2400.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:48<00:11, 66.08it/s, est. speed input: 5297.92 toks/s, output: 2509.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:48<00:09, 72.03it/s, est. speed input: 5385.16 toks/s, output: 2570.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:48<00:05, 119.28it/s, est. speed input: 5650.05 toks/s, output: 2742.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:48<00:05, 112.86it/s, est. speed input: 5822.68 toks/s, output: 2869.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:48<00:07, 87.16it/s, est. speed input: 5929.63 toks/s, output: 2948.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:48<00:07, 89.94it/s, est. speed input: 6052.38 toks/s, output: 3032.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:49<00:07, 79.95it/s, est. speed input: 6171.39 toks/s, output: 3125.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:49<00:07, 77.53it/s, est. speed input: 6291.11 toks/s, output: 3219.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:49<00:08, 71.92it/s, est. speed input: 6357.57 toks/s, output: 3251.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:49<00:07, 83.13it/s, est. speed input: 6479.44 toks/s, output: 3333.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:49<00:08, 69.92it/s, est. speed input: 6541.81 toks/s, output: 3389.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:50<00:05, 94.88it/s, est. speed input: 6764.94 toks/s, output: 3520.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:50<00:05, 96.95it/s, est. speed input: 6876.88 toks/s, output: 3622.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:50<00:07, 66.93it/s, est. speed input: 6959.96 toks/s, output: 3691.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:50<00:06, 78.19it/s, est. speed input: 7077.76 toks/s, output: 3796.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:50<00:03, 141.45it/s, est. speed input: 7487.15 toks/s, output: 4064.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:50<00:03, 130.74it/s, est. speed input: 7654.11 toks/s, output: 4192.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:51<00:03, 125.36it/s, est. speed input: 7802.92 toks/s, output: 4300.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:51<00:03, 107.30it/s, est. speed input: 7944.97 toks/s, output: 4428.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:51<00:04, 93.42it/s, est. speed input: 8044.04 toks/s, output: 4506.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:51<00:04, 87.32it/s, est. speed input: 8147.29 toks/s, output: 4593.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:52<00:03, 92.70it/s, est. speed input: 8376.87 toks/s, output: 4789.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:52<00:02, 139.41it/s, est. speed input: 8708.84 toks/s, output: 5054.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:52<00:01, 147.12it/s, est. speed input: 8861.58 toks/s, output: 5193.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:52<00:01, 143.59it/s, est. speed input: 9004.95 toks/s, output: 5318.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:52<00:01, 137.86it/s, est. speed input: 9151.42 toks/s, output: 5444.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:52<00:02, 104.05it/s, est. speed input: 9283.23 toks/s, output: 5584.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:53<00:02, 92.41it/s, est. speed input: 9406.20 toks/s, output: 5698.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:53<00:01, 96.86it/s, est. speed input: 9507.60 toks/s, output: 5809.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:53<00:01, 103.18it/s, est. speed input: 9614.28 toks/s, output: 5912.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:53<00:01, 91.58it/s, est. speed input: 9708.23 toks/s, output: 6027.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:54<00:01, 75.87it/s, est. speed input: 9785.85 toks/s, output: 6117.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:54<00:01, 71.97it/s, est. speed input: 9840.86 toks/s, output: 6194.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:54<00:01, 79.21it/s, est. speed input: 9949.19 toks/s, output: 6307.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:54<00:01, 66.40it/s, est. speed input: 9994.67 toks/s, output: 6388.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:01, 81.00it/s, est. speed input: 10112.46 toks/s, output: 6490.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:54<00:00, 88.67it/s, est. speed input: 10211.21 toks/s, output: 6612.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:54<00:00, 103.35it/s, est. speed input: 10390.33 toks/s, output: 6808.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:55<00:00, 89.84it/s, est. speed input: 10543.94 toks/s, output: 6993.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:00, 55.16it/s, est. speed input: 10532.64 toks/s, output: 7029.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:56<00:00, 40.64it/s, est. speed input: 10526.11 toks/s, output: 7072.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:56<00:00, 22.57it/s, est. speed input: 10495.97 toks/s, output: 7073.35 toks/s]
[36m(Runner pid=3309020)[0m ==============================
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 61; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:31:16 [executor_base.py:219] It took 0.344526 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:32:48 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:31:16 [executor_base.py:219] It took 0.348294 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:32:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:32:48 [executor_base.py:208] It took 0.327125 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:35:00 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:35:00 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:35:00 [executor_base.py:208] It took 0.325101 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00030003878055140376, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2292870581150055, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005475314101204276}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002992306835949421, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007405496435239911}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.4674745202064514, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002157518028980121, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.14320766925811768, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2991117537021637, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.3647507429122925, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.25824517011642456, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3712461292743683, 'actor/pg_clipfrac': 0.0010542962700128555, 'actor/ppo_kl': -0.0004526468983385712}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.05511362850666046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003900917072314769, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001396981970174238}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.7327921986579895, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017122541321441531}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.5082240104675293, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.5122632384300232, 'actor/pg_clipfrac': 0.0019656019285321236, 'actor/ppo_kl': -0.0003140920598525554}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.10531012713909149, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.18881425261497498, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016681820852681994}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.17361308634281158, 'actor/pg_clipfrac': 0.0012285012053325772, 'actor/ppo_kl': -0.0001315758127020672}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2803502380847931, 'actor/pg_clipfrac': 0.001347708865068853, 'actor/ppo_kl': 0.0007001666235737503}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00045214834972284734, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012224757811054587}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.17251183092594147, 'actor/pg_clipfrac': 0.004149377811700106, 'actor/ppo_kl': -0.00011482871923362836}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4023367166519165, 'actor/pg_clipfrac': 0.0012004801537841558, 'actor/ppo_kl': -0.0004143514670431614}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.22720983624458313, 'actor/pg_clipfrac': 0.0009049773798324168, 'actor/ppo_kl': -0.00034456036519259214}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2430577278137207, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009079416049644351}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0004422226338647306, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011987988837063313}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.41992685198783875, 'actor/pg_clipfrac': 0.0016447368543595076, 'actor/ppo_kl': 0.0011052426416426897}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.26085007190704346, 'actor/pg_clipfrac': 0.0006968640955165029, 'actor/ppo_kl': -2.3313516521739075e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003765162837225944, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004847844538744539}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.13214111328125, 'actor/pg_clipfrac': 0.0031298904214054346, 'actor/ppo_kl': 6.409318302758038e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0005313390283845365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014452273026108742}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.16747897863388062, 'actor/pg_clipfrac': 0.0015174506697803736, 'actor/ppo_kl': 0.0012540281750261784}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.7609363198280334, 'actor/pg_clipfrac': 0.0030562346801161766, 'actor/ppo_kl': 0.00041912004235200584}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00030081800650805235, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000840820197481662}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.09754932671785355, 'actor/pg_clipfrac': 0.0009823183063417673, 'actor/ppo_kl': 3.537400971254101e-06}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.20340019464492798, 'actor/pg_clipfrac': 0.0026845638640224934, 'actor/ppo_kl': -0.0010329842334613204}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.2938876450061798, 'actor/pg_clipfrac': 0.001609010505490005, 'actor/ppo_kl': 2.243858398287557e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0003008809871971607, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005375609616748989}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00030264639644883573, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007448737742379308}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.11720318347215652, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012259139912202954}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.13394561409950256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001732214936055243}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00037687114672735333, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003064478514716029}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.15116718411445618, 'actor/pg_clipfrac': 0.0021897810511291027, 'actor/ppo_kl': -0.0016697528772056103}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00024309230502694845, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006879919674247503}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.5825226306915283, 'actor/pg_clipfrac': 0.0016652789199724793, 'actor/ppo_kl': -0.0005399146466515958}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.25536489486694336, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006443614838644862}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.18211956322193146, 'actor/pg_clipfrac': 0.0006142506026662886, 'actor/ppo_kl': 0.0003199577331542969}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2345048487186432, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010410577524453402}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.722414493560791, 'actor/pg_clipfrac': 0.003960396163165569, 'actor/ppo_kl': 0.0005021992255933583}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.24992989003658295, 'actor/pg_clipfrac': 0.0005827505956403911, 'actor/ppo_kl': 0.0004901652573607862}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.43441057205200195, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006008690106682479}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.28364163637161255, 'actor/pg_clipfrac': 0.0017231475794687867, 'actor/ppo_kl': 0.0005975840613245964}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.09259209036827087, 'actor/pg_clipfrac': 0.0033240998163819313, 'actor/ppo_kl': -0.0006493438850156963}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.4433755576610565, 'actor/pg_clipfrac': 0.0015090543311089277, 'actor/ppo_kl': 0.00030248553957790136}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003889180952683091, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00047031912254169583}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.22364526987075806, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000681252044159919}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.5854781270027161, 'actor/pg_clipfrac': 0.0014144271844998002, 'actor/ppo_kl': 0.00017588614718988538}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.22548462450504303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014044779818505049}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.28897911310195923, 'actor/pg_clipfrac': 0.000834724516607821, 'actor/ppo_kl': -0.0013136346824467182}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0031796819530427456, 'actor/pg_clipfrac': 0.0005443657864816487, 'actor/ppo_kl': -0.0021375957876443863}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0003204625390935689, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007526036351919174}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.22075295448303223, 'actor/pg_clipfrac': 0.0012674271129071712, 'actor/ppo_kl': 7.391520193777978e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00026176971732638776, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.063346078619361e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0003217466874048114, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009567966335453093}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0910169929265976, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010722728911787271}
[36m(Runner pid=3309020)[0m Step 61
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.257
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.038
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.006
[36m(Runner pid=3309020)[0m ppo_kl: -3.5566252906349405e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.675
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.675
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 996194
[36m(Runner pid=3309020)[0m balanced_min: 996193
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:31:08, 4.29s/it, est. speed input: 104.23 toks/s, output: 27.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<46:40, 2.20s/it, est. speed input: 180.93 toks/s, output: 47.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:25<25:40, 1.22s/it, est. speed input: 271.17 toks/s, output: 72.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<19:50, 1.06it/s, est. speed input: 324.96 toks/s, output: 89.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<14:38, 1.43it/s, est. speed input: 390.50 toks/s, output: 109.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:29<10:28, 1.99it/s, est. speed input: 459.61 toks/s, output: 132.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<08:36, 2.41it/s, est. speed input: 516.77 toks/s, output: 154.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<06:42, 3.08it/s, est. speed input: 579.72 toks/s, output: 175.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<05:26, 3.79it/s, est. speed input: 636.42 toks/s, output: 197.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<04:01, 5.09it/s, est. speed input: 702.84 toks/s, output: 220.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<05:04, 4.02it/s, est. speed input: 729.85 toks/s, output: 231.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:34<02:50, 7.14it/s, est. speed input: 867.06 toks/s, output: 274.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:21, 8.56it/s, est. speed input: 927.58 toks/s, output: 293.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:35<02:13, 9.04it/s, est. speed input: 975.04 toks/s, output: 317.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:35<01:56, 10.29it/s, est. speed input: 1033.95 toks/s, output: 336.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:36<02:14, 8.90it/s, est. speed input: 1078.63 toks/s, output: 353.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:45, 11.26it/s, est. speed input: 1136.14 toks/s, output: 379.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:36<01:30, 13.15it/s, est. speed input: 1192.69 toks/s, output: 393.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<00:55, 21.00it/s, est. speed input: 1371.44 toks/s, output: 464.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:51, 22.84it/s, est. speed input: 1430.00 toks/s, output: 486.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<00:37, 30.43it/s, est. speed input: 1548.74 toks/s, output: 528.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:37<00:50, 22.77it/s, est. speed input: 1589.37 toks/s, output: 539.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:27, 41.15it/s, est. speed input: 1877.58 toks/s, output: 664.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:30, 37.25it/s, est. speed input: 1928.77 toks/s, output: 683.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:38<00:30, 37.03it/s, est. speed input: 1984.79 toks/s, output: 710.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:38<00:24, 44.33it/s, est. speed input: 2105.41 toks/s, output: 755.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:38<00:26, 41.62it/s, est. speed input: 2210.87 toks/s, output: 798.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:39<00:26, 40.73it/s, est. speed input: 2266.14 toks/s, output: 817.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:39<00:27, 39.97it/s, est. speed input: 2316.85 toks/s, output: 842.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:50, 21.35it/s, est. speed input: 2340.63 toks/s, output: 854.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:38, 27.89it/s, est. speed input: 2446.98 toks/s, output: 898.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:40<00:28, 36.72it/s, est. speed input: 2561.96 toks/s, output: 947.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:40<00:31, 33.28it/s, est. speed input: 2657.16 toks/s, output: 986.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:40<00:27, 38.48it/s, est. speed input: 2759.84 toks/s, output: 1038.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:40<00:25, 39.77it/s, est. speed input: 2853.61 toks/s, output: 1076.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:41<00:17, 57.94it/s, est. speed input: 3073.44 toks/s, output: 1192.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:41<00:17, 57.96it/s, est. speed input: 3175.63 toks/s, output: 1250.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:27, 36.24it/s, est. speed input: 3243.14 toks/s, output: 1294.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:21, 46.26it/s, est. speed input: 3392.99 toks/s, output: 1371.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:42<00:24, 39.06it/s, est. speed input: 3469.49 toks/s, output: 1405.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:42<00:31, 30.71it/s, est. speed input: 3535.06 toks/s, output: 1443.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:42<00:29, 32.72it/s, est. speed input: 3580.93 toks/s, output: 1456.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:43<00:25, 36.61it/s, est. speed input: 3670.00 toks/s, output: 1509.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:43<00:16, 55.87it/s, est. speed input: 3871.52 toks/s, output: 1618.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:12, 73.33it/s, est. speed input: 4072.88 toks/s, output: 1731.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:14, 62.61it/s, est. speed input: 4161.87 toks/s, output: 1773.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:13, 64.22it/s, est. speed input: 4252.61 toks/s, output: 1811.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:11, 75.27it/s, est. speed input: 4392.34 toks/s, output: 1894.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:44<00:12, 69.98it/s, est. speed input: 4485.57 toks/s, output: 1946.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:44<00:13, 61.82it/s, est. speed input: 4566.53 toks/s, output: 1994.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:12, 65.47it/s, est. speed input: 4656.50 toks/s, output: 2054.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:44<00:09, 87.90it/s, est. speed input: 4846.74 toks/s, output: 2165.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:44<00:09, 86.03it/s, est. speed input: 4935.85 toks/s, output: 2199.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:44<00:09, 84.97it/s, est. speed input: 5023.83 toks/s, output: 2247.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:45<00:10, 71.53it/s, est. speed input: 5107.91 toks/s, output: 2309.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:10, 74.45it/s, est. speed input: 5194.47 toks/s, output: 2359.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:09, 76.99it/s, est. speed input: 5290.01 toks/s, output: 2412.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:45<00:08, 90.42it/s, est. speed input: 5433.04 toks/s, output: 2477.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:45<00:17, 43.44it/s, est. speed input: 5471.11 toks/s, output: 2506.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:46<00:14, 48.71it/s, est. speed input: 5552.77 toks/s, output: 2560.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:46<00:12, 56.65it/s, est. speed input: 5633.82 toks/s, output: 2617.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:09, 70.05it/s, est. speed input: 5814.91 toks/s, output: 2744.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:10, 62.72it/s, est. speed input: 5930.59 toks/s, output: 2809.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:46<00:09, 70.23it/s, est. speed input: 6054.69 toks/s, output: 2888.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:10, 62.64it/s, est. speed input: 6123.25 toks/s, output: 2952.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 645/1280 [00:47<00:06, 91.28it/s, est. speed input: 6355.18 toks/s, output: 3066.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:06, 98.52it/s, est. speed input: 6492.31 toks/s, output: 3145.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:47<00:06, 94.09it/s, est. speed input: 6614.77 toks/s, output: 3224.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:47<00:05, 106.38it/s, est. speed input: 6786.99 toks/s, output: 3354.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:47<00:05, 100.85it/s, est. speed input: 6914.99 toks/s, output: 3438.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:47<00:05, 96.59it/s, est. speed input: 7031.45 toks/s, output: 3517.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:48<00:05, 98.56it/s, est. speed input: 7156.27 toks/s, output: 3611.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:48<00:05, 100.41it/s, est. speed input: 7280.54 toks/s, output: 3702.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:48<00:05, 99.36it/s, est. speed input: 7448.49 toks/s, output: 3822.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:48<00:04, 98.70it/s, est. speed input: 7578.07 toks/s, output: 3914.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:04, 108.04it/s, est. speed input: 7696.84 toks/s, output: 4004.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:48<00:05, 84.69it/s, est. speed input: 7790.10 toks/s, output: 4093.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:49<00:04, 99.08it/s, est. speed input: 7959.99 toks/s, output: 4223.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:03, 120.17it/s, est. speed input: 8175.00 toks/s, output: 4382.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:49<00:03, 123.43it/s, est. speed input: 8388.56 toks/s, output: 4550.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:49<00:03, 113.60it/s, est. speed input: 8497.06 toks/s, output: 4640.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:49<00:03, 110.53it/s, est. speed input: 8608.05 toks/s, output: 4732.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:50<00:03, 90.94it/s, est. speed input: 8708.48 toks/s, output: 4808.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:50<00:02, 108.83it/s, est. speed input: 8914.32 toks/s, output: 4986.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 975/1280 [00:50<00:03, 96.56it/s, est. speed input: 9011.54 toks/s, output: 5055.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:50<00:02, 106.27it/s, est. speed input: 9162.64 toks/s, output: 5189.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:50<00:02, 93.65it/s, est. speed input: 9257.85 toks/s, output: 5278.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:50<00:02, 93.70it/s, est. speed input: 9332.28 toks/s, output: 5355.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:50<00:02, 106.45it/s, est. speed input: 9489.32 toks/s, output: 5484.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:51<00:01, 113.07it/s, est. speed input: 9607.12 toks/s, output: 5580.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:51<00:01, 131.51it/s, est. speed input: 9768.71 toks/s, output: 5741.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1100/1280 [00:51<00:01, 160.58it/s, est. speed input: 9976.84 toks/s, output: 5929.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:00, 188.50it/s, est. speed input: 10222.82 toks/s, output: 6191.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:51<00:00, 153.59it/s, est. speed input: 10368.02 toks/s, output: 6297.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:51<00:01, 104.78it/s, est. speed input: 10485.75 toks/s, output: 6419.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:52<00:01, 73.31it/s, est. speed input: 10535.86 toks/s, output: 6490.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:52<00:00, 80.71it/s, est. speed input: 10649.48 toks/s, output: 6604.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:52<00:00, 71.17it/s, est. speed input: 10722.38 toks/s, output: 6716.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:53<00:00, 63.90it/s, est. speed input: 10763.42 toks/s, output: 6765.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:53<00:00, 63.22it/s, est. speed input: 10810.04 toks/s, output: 6854.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:53<00:00, 41.72it/s, est. speed input: 10793.82 toks/s, output: 6881.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:54<00:00, 35.97it/s, est. speed input: 10808.94 toks/s, output: 6950.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:54<00:00, 35.52it/s, est. speed input: 10823.89 toks/s, output: 6966.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:54<00:00, 34.25it/s, est. speed input: 10837.03 toks/s, output: 6998.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:54<00:00, 30.12it/s, est. speed input: 10833.47 toks/s, output: 7019.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 16.65it/s, est. speed input: 10711.51 toks/s, output: 6988.50 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:55<00:00, 22.99it/s, est. speed input: 10711.51 toks/s, output: 6988.50 toks/s]
[36m(Runner pid=3309020)[0m max: 996312
[36m(Runner pid=3309020)[0m mean: 996193.5
[36m(Runner pid=3309020)[0m min: 996075
[36m(Runner pid=3309020)[0m minmax_diff: 237
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 109.579
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.013
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1006.779
[36m(Runner pid=3309020)[0m time_per_step: 989.486
[36m(Runner pid=3309020)[0m total_num_tokens: 1992387
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 607.0
[36m(Runner pid=3309020)[0m mean: 465.039
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1423.0
[36m(Runner pid=3309020)[0m mean: 313.237
[36m(Runner pid=3309020)[0m min: 67.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.352
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.675
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.337144642149856e-05
[36m(Runner pid=3309020)[0m gen: 0.304
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.009
[36m(Runner pid=3309020)[0m update_actor: 0.282
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.146
[36m(Runner pid=3309020)[0m gen: 243.449
[36m(Runner pid=3309020)[0m old: 88.466
[36m(Runner pid=3309020)[0m ref: 87.538
[36m(Runner pid=3309020)[0m reward: 6.824
[36m(Runner pid=3309020)[0m step: 989.486
[36m(Runner pid=3309020)[0m update_actor: 562.321
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 62; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.07 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:47:52 [executor_base.py:219] It took 0.344110 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:49:21 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:47:52 [executor_base.py:219] It took 0.345440 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:49:21 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.79 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 12:49:21 [executor_base.py:208] It took 0.325351 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:49:56 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:49:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 12:49:56 [executor_base.py:208] It took 0.328198 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.07525692135095596, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.49796363711357117, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.03746228292584419, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005193090764805675, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00020336126908659935, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.24987226724624634, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013800807064399123}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3075273931026459, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00024973065592348576, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00018333815387450159}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.07539571821689606, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00022611935855820775, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003459910221863538}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.2710559070110321, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.10063489526510239, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1290966272354126, 'actor/pg_clipfrac': 0.001088139251805842, 'actor/ppo_kl': -0.00047774205449968576}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.21282914280891418, 'actor/pg_clipfrac': 0.0029922202229499817, 'actor/ppo_kl': 0.002186359139159322}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00021299306536093354, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00023329495161306113, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.08832088857889175, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001673647202551365}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.5787842869758606, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004909112467430532}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5880802869796753, 'actor/pg_clipfrac': 0.002754820976406336, 'actor/ppo_kl': -0.00047225531307049096}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.1410585641860962, 'actor/pg_clipfrac': 0.0043898154981434345, 'actor/ppo_kl': -0.002971112495288253}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.15205904841423035, 'actor/pg_clipfrac': 0.009641873650252819, 'actor/ppo_kl': -0.0018848965410143137}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.35518819093704224, 'actor/pg_clipfrac': 0.001550387591123581, 'actor/ppo_kl': 0.0013164535630494356}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00027686628163792193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018407206516712904}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00028273992938920856, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001011377782560885}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.2807050049304962, 'actor/pg_clipfrac': 0.0027322403620928526, 'actor/ppo_kl': -0.0011953135253861547}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.36618033051490784, 'actor/pg_clipfrac': 0.0021216408349573612, 'actor/ppo_kl': 0.0003337050729896873}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.09306707978248596, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007571169990114868}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.17565572261810303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006405328749679029}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00033785580308176577, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008589116623625159}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.1320682317018509, 'actor/pg_clipfrac': 0.0022573363967239857, 'actor/ppo_kl': -0.0014392839511856437}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.15776532888412476, 'actor/pg_clipfrac': 0.002425222424790263, 'actor/ppo_kl': 2.0280805983929895e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08273951709270477, 'actor/pg_clipfrac': 0.0019047618843615055, 'actor/ppo_kl': -0.0006898910505697131}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00012392183998599648, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00027789673185907304}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.04665470868349075, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000394248723750934}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003389060730114579, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014270493993535638}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.11313959956169128, 'actor/pg_clipfrac': 0.0008237232104875147, 'actor/ppo_kl': 0.00163273757789284}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.36818400025367737, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008491678163409233}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.05122742801904678, 'actor/pg_clipfrac': 0.0006954103009775281, 'actor/ppo_kl': 0.0001731349912006408}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2195536196231842, 'actor/pg_clipfrac': 0.003442340763285756, 'actor/ppo_kl': -0.0012467921478673816}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.32440564036369324, 'actor/pg_clipfrac': 0.0027829313185065985, 'actor/ppo_kl': -0.0002164203760912642}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.000267029128735885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019474857253953815}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.29836305975914, 'actor/pg_clipfrac': 0.0017079418757930398, 'actor/ppo_kl': -0.00019412933033891022}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.32403847575187683, 'actor/pg_clipfrac': 0.0010493178851902485, 'actor/ppo_kl': 0.0014036104548722506}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00035261636367067695, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011260181199759245}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0006675926269963384, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012788402382284403}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.29743245244026184, 'actor/pg_clipfrac': 0.0006064281333237886, 'actor/ppo_kl': -0.0010633821366354823}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.2701018452644348, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0021450100466609}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 1.354417324066162, 'actor/pg_clipfrac': 0.00014647722127847373, 'actor/ppo_kl': 0.00045765165123157203}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.24438108503818512, 'actor/pg_clipfrac': 0.002018842613324523, 'actor/ppo_kl': 0.0018920821603387594}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.12275323271751404, 'actor/pg_clipfrac': 0.0009910803055390716, 'actor/ppo_kl': -0.0007161442190408707}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00042947198380716145, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013899484183639288}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.1836661398410797, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00147626840043813}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.07334303855895996, 'actor/pg_clipfrac': 0.0007836990407668054, 'actor/ppo_kl': 0.0008706283988431096}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4142078757286072, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0026940691750496626}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.15064884722232819, 'actor/pg_clipfrac': 0.001809408888220787, 'actor/ppo_kl': -0.0005133735830895603}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.09301187843084335, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008688821108080447}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00023206297191791236, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009424726013094187}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2744249403476715, 'actor/pg_clipfrac': 0.00513478834182024, 'actor/ppo_kl': -0.0008806123514659703}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00017607238260097802, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002814031904563308}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.10255169868469238, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013820487074553967}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.44736018776893616, 'actor/pg_clipfrac': 0.0020477815996855497, 'actor/ppo_kl': 0.001341119990684092}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00038158189272508025, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002827107673510909}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.15149566531181335, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.002574412850663066}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.18773749470710754, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017146067693829536}
[36m(Runner pid=3309020)[0m Step 62
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.255
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.022
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:19<1:22:22, 3.88s/it, est. speed input: 118.16 toks/s, output: 23.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<48:42, 2.30s/it, est. speed input: 185.84 toks/s, output: 42.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<32:11, 1.53s/it, est. speed input: 242.05 toks/s, output: 64.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<19:41, 1.07it/s, est. speed input: 321.65 toks/s, output: 89.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:29<13:53, 1.51it/s, est. speed input: 390.39 toks/s, output: 107.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:30<10:35, 1.97it/s, est. speed input: 451.78 toks/s, output: 127.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<08:59, 2.31it/s, est. speed input: 503.51 toks/s, output: 146.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<06:23, 3.23it/s, est. speed input: 572.25 toks/s, output: 166.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<04:39, 4.41it/s, est. speed input: 642.56 toks/s, output: 192.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<04:18, 4.77it/s, est. speed input: 696.24 toks/s, output: 211.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<04:52, 4.18it/s, est. speed input: 729.40 toks/s, output: 223.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:35<04:31, 4.50it/s, est. speed input: 774.01 toks/s, output: 240.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<03:42, 5.47it/s, est. speed input: 827.65 toks/s, output: 259.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:45, 7.30it/s, est. speed input: 883.42 toks/s, output: 282.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:12, 9.12it/s, est. speed input: 943.60 toks/s, output: 303.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<02:15, 8.84it/s, est. speed input: 990.03 toks/s, output: 320.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:05, 18.15it/s, est. speed input: 1174.73 toks/s, output: 388.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<00:58, 20.06it/s, est. speed input: 1232.80 toks/s, output: 411.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<01:13, 15.95it/s, est. speed input: 1274.11 toks/s, output: 427.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<01:11, 16.27it/s, est. speed input: 1324.93 toks/s, output: 445.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:38<00:40, 28.86it/s, est. speed input: 1502.21 toks/s, output: 510.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:38, 29.96it/s, est. speed input: 1557.75 toks/s, output: 531.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:35, 31.87it/s, est. speed input: 1666.99 toks/s, output: 577.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:38<00:28, 39.19it/s, est. speed input: 1778.23 toks/s, output: 622.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:39<00:24, 46.08it/s, est. speed input: 1892.38 toks/s, output: 663.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:39<00:38, 29.18it/s, est. speed input: 1976.74 toks/s, output: 705.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:39<00:26, 41.69it/s, est. speed input: 2143.33 toks/s, output: 778.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:24, 43.74it/s, est. speed input: 2243.88 toks/s, output: 820.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:40<00:29, 36.27it/s, est. speed input: 2335.37 toks/s, output: 872.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:40<00:24, 43.27it/s, est. speed input: 2441.65 toks/s, output: 919.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:40<00:28, 37.23it/s, est. speed input: 2533.69 toks/s, output: 967.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:40<00:23, 44.61it/s, est. speed input: 2646.05 toks/s, output: 1016.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:41<00:28, 36.06it/s, est. speed input: 2737.16 toks/s, output: 1060.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:41<00:25, 40.68it/s, est. speed input: 2840.27 toks/s, output: 1111.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:41<00:20, 48.42it/s, est. speed input: 2942.12 toks/s, output: 1171.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:41<00:17, 56.04it/s, est. speed input: 3044.14 toks/s, output: 1221.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:41<00:12, 81.87it/s, est. speed input: 3256.56 toks/s, output: 1327.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:42<00:18, 53.54it/s, est. speed input: 3379.18 toks/s, output: 1397.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:42<00:17, 55.85it/s, est. speed input: 3475.05 toks/s, output: 1450.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:42<00:13, 70.46it/s, est. speed input: 3623.53 toks/s, output: 1521.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:42<00:13, 69.05it/s, est. speed input: 3722.55 toks/s, output: 1567.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:42<00:12, 74.57it/s, est. speed input: 3817.30 toks/s, output: 1618.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:43<00:15, 60.01it/s, est. speed input: 3897.41 toks/s, output: 1666.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:12, 70.27it/s, est. speed input: 4042.29 toks/s, output: 1743.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:43<00:11, 76.10it/s, est. speed input: 4141.69 toks/s, output: 1800.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:16, 52.82it/s, est. speed input: 4213.83 toks/s, output: 1846.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:43<00:15, 55.76it/s, est. speed input: 4344.20 toks/s, output: 1927.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:44<00:13, 61.89it/s, est. speed input: 4480.61 toks/s, output: 1992.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:12, 67.52it/s, est. speed input: 4612.44 toks/s, output: 2039.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:16, 48.65it/s, est. speed input: 4676.83 toks/s, output: 2078.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:44<00:16, 50.84it/s, est. speed input: 4759.55 toks/s, output: 2132.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:45<00:16, 49.94it/s, est. speed input: 4835.77 toks/s, output: 2181.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:45<00:16, 49.41it/s, est. speed input: 4914.03 toks/s, output: 2239.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:45<00:10, 71.18it/s, est. speed input: 5104.51 toks/s, output: 2340.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:45<00:14, 53.92it/s, est. speed input: 5172.28 toks/s, output: 2378.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:46<00:15, 50.07it/s, est. speed input: 5247.03 toks/s, output: 2416.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:46<00:14, 52.88it/s, est. speed input: 5326.05 toks/s, output: 2462.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:46<00:12, 59.01it/s, est. speed input: 5409.85 toks/s, output: 2516.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:46<00:11, 64.86it/s, est. speed input: 5495.58 toks/s, output: 2577.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:46<00:09, 76.14it/s, est. speed input: 5627.79 toks/s, output: 2677.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:46<00:08, 80.55it/s, est. speed input: 5710.67 toks/s, output: 2719.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:46<00:07, 89.08it/s, est. speed input: 5846.17 toks/s, output: 2799.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:47<00:08, 83.65it/s, est. speed input: 5959.70 toks/s, output: 2870.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 620/1280 [00:47<00:08, 81.75it/s, est. speed input: 6038.92 toks/s, output: 2913.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:47<00:07, 85.44it/s, est. speed input: 6172.64 toks/s, output: 3005.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:47<00:07, 83.49it/s, est. speed input: 6296.33 toks/s, output: 3088.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:47<00:09, 63.59it/s, est. speed input: 6352.70 toks/s, output: 3117.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:47<00:09, 64.35it/s, est. speed input: 6434.24 toks/s, output: 3175.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:48<00:07, 82.38it/s, est. speed input: 6608.56 toks/s, output: 3290.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 700/1280 [00:48<00:08, 65.10it/s, est. speed input: 6666.63 toks/s, output: 3319.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:48<00:09, 58.16it/s, est. speed input: 6728.07 toks/s, output: 3373.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:48<00:09, 58.09it/s, est. speed input: 6797.80 toks/s, output: 3423.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:48<00:09, 56.04it/s, est. speed input: 6865.61 toks/s, output: 3475.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:49<00:06, 79.15it/s, est. speed input: 7035.85 toks/s, output: 3611.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:49<00:05, 90.29it/s, est. speed input: 7155.80 toks/s, output: 3723.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:49<00:05, 88.11it/s, est. speed input: 7274.51 toks/s, output: 3803.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:49<00:05, 85.11it/s, est. speed input: 7354.10 toks/s, output: 3872.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:49<00:04, 110.20it/s, est. speed input: 7564.22 toks/s, output: 4040.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:49<00:05, 77.01it/s, est. speed input: 7644.44 toks/s, output: 4108.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:50<00:06, 73.18it/s, est. speed input: 7705.57 toks/s, output: 4169.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:50<00:06, 63.99it/s, est. speed input: 7765.49 toks/s, output: 4231.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:50<00:06, 65.74it/s, est. speed input: 7833.16 toks/s, output: 4286.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:50<00:06, 58.52it/s, est. speed input: 7950.44 toks/s, output: 4402.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:50<00:04, 78.56it/s, est. speed input: 8115.98 toks/s, output: 4527.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:51<00:04, 84.54it/s, est. speed input: 8232.54 toks/s, output: 4628.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:51<00:03, 90.25it/s, est. speed input: 8338.62 toks/s, output: 4713.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:51<00:03, 98.41it/s, est. speed input: 8445.58 toks/s, output: 4825.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 960/1280 [00:51<00:03, 95.36it/s, est. speed input: 8545.91 toks/s, output: 4912.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:51<00:02, 114.79it/s, est. speed input: 8716.64 toks/s, output: 5038.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:51<00:02, 118.42it/s, est. speed input: 8915.79 toks/s, output: 5221.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:52<00:01, 135.91it/s, est. speed input: 9119.48 toks/s, output: 5383.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:52<00:01, 123.73it/s, est. speed input: 9223.80 toks/s, output: 5490.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:52<00:02, 89.87it/s, est. speed input: 9297.00 toks/s, output: 5584.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:52<00:02, 96.71it/s, est. speed input: 9402.89 toks/s, output: 5687.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:52<00:01, 114.97it/s, est. speed input: 9561.87 toks/s, output: 5822.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:52<00:01, 121.31it/s, est. speed input: 9676.53 toks/s, output: 5952.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:52<00:01, 125.60it/s, est. speed input: 9788.75 toks/s, output: 6074.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:53<00:01, 88.76it/s, est. speed input: 9866.85 toks/s, output: 6170.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:53<00:01, 96.94it/s, est. speed input: 9979.43 toks/s, output: 6295.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:53<00:01, 94.85it/s, est. speed input: 10081.94 toks/s, output: 6393.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:53<00:01, 88.86it/s, est. speed input: 10179.54 toks/s, output: 6491.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:53<00:01, 70.43it/s, est. speed input: 10220.53 toks/s, output: 6553.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:54<00:01, 66.09it/s, est. speed input: 10265.58 toks/s, output: 6618.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:54<00:01, 45.85it/s, est. speed input: 10269.55 toks/s, output: 6654.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:54<00:01, 45.10it/s, est. speed input: 10310.01 toks/s, output: 6706.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:54<00:00, 52.97it/s, est. speed input: 10374.96 toks/s, output: 6785.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:00, 42.70it/s, est. speed input: 10396.83 toks/s, output: 6850.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:55<00:00, 34.90it/s, est. speed input: 10406.98 toks/s, output: 6888.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 36.63it/s, est. speed input: 10429.03 toks/s, output: 6945.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:00, 33.21it/s, est. speed input: 10433.99 toks/s, output: 6969.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:56<00:00, 19.69it/s, est. speed input: 10356.48 toks/s, output: 6945.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:57<00:00, 9.43it/s, est. speed input: 10139.41 toks/s, output: 6819.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:16<00:00, 1.03s/it, est. speed input: 7699.26 toks/s, output: 5236.54 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:16<00:00, 16.70it/s, est. speed input: 7699.26 toks/s, output: 5236.54 toks/s]
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.016
[36m(Runner pid=3309020)[0m ppo_kl: 2.241731248187051e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.028
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.028
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 991232
[36m(Runner pid=3309020)[0m balanced_min: 987867
[36m(Runner pid=3309020)[0m max: 993665
[36m(Runner pid=3309020)[0m mean: 989549.5
[36m(Runner pid=3309020)[0m min: 985434
[36m(Runner pid=3309020)[0m minmax_diff: 8231
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.862
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.076
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1112.299
[36m(Runner pid=3309020)[0m time_per_step: 889.643
[36m(Runner pid=3309020)[0m total_num_tokens: 1979099
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 814.0
[36m(Runner pid=3309020)[0m mean: 465.916
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 5500.0
[36m(Runner pid=3309020)[0m mean: 307.17
[36m(Runner pid=3309020)[0m min: 64.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.355
[36m(Runner pid=3309020)[0m format: 1.0
[36m(Runner pid=3309020)[0m overall: 0.677
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.226963582067296e-05
[36m(Runner pid=3309020)[0m gen: 0.176
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.286
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.163
[36m(Runner pid=3309020)[0m gen: 138.79
[36m(Runner pid=3309020)[0m old: 88.633
[36m(Runner pid=3309020)[0m ref: 89.424
[36m(Runner pid=3309020)[0m reward: 6.019
[36m(Runner pid=3309020)[0m step: 889.643
[36m(Runner pid=3309020)[0m update_actor: 565.919
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 63; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:02:47 [executor_base.py:219] It took 0.339609 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:04:15 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:02:47 [executor_base.py:219] It took 0.340024 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:04:15 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:04:15 [executor_base.py:208] It took 0.328025 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:04:36 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:04:37 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:04:37 [executor_base.py:208] It took 0.326686 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.35774487257003784, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.13564684987068176, 'actor/pg_clipfrac': 0.0013404826167970896, 'actor/ppo_kl': 0.000518093176651746}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002674528514035046, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00031898703309707344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.05160241201519966, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.00019338207494001836, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000187591984285973}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3036114275455475, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.36041662096977234, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.7633287906646729, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.028200261294841766, 'actor/pg_clipfrac': 0.0017452007159590721, 'actor/ppo_kl': 0.0009105009958148003}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.014442788437008858, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.40905824303627014, 'actor/pg_clipfrac': 0.001784121268428862, 'actor/ppo_kl': -0.00018424885638523847}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.4228318929672241, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.215089812874794, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2279708981513977, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.13047181069850922, 'actor/pg_clipfrac': 0.004010695032775402, 'actor/ppo_kl': 0.00018720065418165177}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.033876799046993256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015224794624373317}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.18683037161827087, 'actor/pg_clipfrac': 0.00200803205370903, 'actor/ppo_kl': -0.0005949574406258762}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.22280284762382507, 'actor/pg_clipfrac': 0.0011750881094485521, 'actor/ppo_kl': 0.00019045021326746792}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.19058294594287872, 'actor/pg_clipfrac': 0.0012048193020746112, 'actor/ppo_kl': 0.00014152067888062447}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.091578409075737, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005029158783145249}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.7306542992591858, 'actor/pg_clipfrac': 0.0032383420038968325, 'actor/ppo_kl': -0.00044927201815880835}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.2579806447029114, 'actor/pg_clipfrac': 0.0009372071363031864, 'actor/ppo_kl': -0.0027895590756088495}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.2744598686695099, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005656776484102011}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0008447633008472621, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012128595262765884}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.09720133244991302, 'actor/pg_clipfrac': 0.0016906170640140772, 'actor/ppo_kl': 0.0005111581413075328}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00014780032506678253, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002443058474455029}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.24649137258529663, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002056868514046073}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003392933285795152, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008742510108277202}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.06985785067081451, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008713733986951411}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0003199431230314076, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007524688262492418}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002399805380264297, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.397157681523822e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.8373320698738098, 'actor/pg_clipfrac': 0.001251564477570355, 'actor/ppo_kl': -0.0020469443406909704}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.18887798488140106, 'actor/pg_clipfrac': 0.0014245014172047377, 'actor/ppo_kl': -0.001467064954340458}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00037744190194644034, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001497363788075745}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005331540014594793, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0038233231753110886}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00045727172982878983, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -9.75939619820565e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003147149400319904, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010302969021722674}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.31998082995414734, 'actor/pg_clipfrac': 0.0006215040339156985, 'actor/ppo_kl': -0.0009584728977642953}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2472303956747055, 'actor/pg_clipfrac': 0.0020775622688233852, 'actor/ppo_kl': 0.00030971827800385654}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.23536695539951324, 'actor/pg_clipfrac': 0.004370629321783781, 'actor/ppo_kl': -0.00035636575194075704}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.13653215765953064, 'actor/pg_clipfrac': 0.00046948358067311347, 'actor/ppo_kl': 0.0002758169430308044}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.08797389268875122, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006283161346800625}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1582077443599701, 'actor/pg_clipfrac': 0.004590665455907583, 'actor/ppo_kl': -0.0006644680397585034}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.11654896289110184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017086165025830269}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.09841462224721909, 'actor/pg_clipfrac': 0.001913875574246049, 'actor/ppo_kl': 0.0007035415037535131}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.12959103286266327, 'actor/pg_clipfrac': 0.002436053706333041, 'actor/ppo_kl': 0.001633222447708249}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00026950519531965256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00011505457223393023}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3815937042236328, 'actor/pg_clipfrac': 0.0035180298145860434, 'actor/ppo_kl': 0.0015547105576843023}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00046021543676033616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005664870841428638}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3019198477268219, 'actor/pg_clipfrac': 0.0007507507689297199, 'actor/ppo_kl': -0.0007063046796247363}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.15495246648788452, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005882072728127241}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0005693614366464317, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002830681041814387}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.37709495425224304, 'actor/pg_clipfrac': 0.005980861373245716, 'actor/ppo_kl': -0.000928700843360275}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.5195658802986145, 'actor/pg_clipfrac': 0.0028943559154868126, 'actor/ppo_kl': -0.0001503355015302077}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.6577258110046387, 'actor/pg_clipfrac': 0.003709198907017708, 'actor/ppo_kl': -0.0011473805643618107}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00022817066928837448, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007187426672317088}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3336789011955261, 'actor/pg_clipfrac': 0.0023885350674390793, 'actor/ppo_kl': -0.00015080659068189561}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.26412007212638855, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000501699629239738}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.26714321970939636, 'actor/pg_clipfrac': 0.0006249999860301614, 'actor/ppo_kl': -0.001175428624264896}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:17<1:12:33, 3.41s/it, est. speed input: 126.22 toks/s, output: 22.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:24<48:04, 2.27s/it, est. speed input: 182.40 toks/s, output: 39.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<26:44, 1.27s/it, est. speed input: 272.80 toks/s, output: 63.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:25<17:05, 1.23it/s, est. speed input: 359.86 toks/s, output: 85.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:32<21:50, 1.04s/it, est. speed input: 349.68 toks/s, output: 90.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:32<14:43, 1.42it/s, est. speed input: 414.24 toks/s, output: 113.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<08:01, 2.57it/s, est. speed input: 532.42 toks/s, output: 160.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<04:56, 4.15it/s, est. speed input: 662.86 toks/s, output: 198.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<04:05, 4.99it/s, est. speed input: 725.10 toks/s, output: 219.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:34<03:13, 6.31it/s, est. speed input: 788.05 toks/s, output: 245.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:34<02:08, 9.45it/s, est. speed input: 914.60 toks/s, output: 291.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:34<01:55, 10.46it/s, est. speed input: 977.25 toks/s, output: 312.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:35<01:24, 14.16it/s, est. speed input: 1156.46 toks/s, output: 375.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:35<01:17, 15.23it/s, est. speed input: 1209.59 toks/s, output: 395.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:36<01:02, 18.89it/s, est. speed input: 1330.31 toks/s, output: 443.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:36<01:06, 17.57it/s, est. speed input: 1383.15 toks/s, output: 464.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:36<00:54, 21.47it/s, est. speed input: 1497.56 toks/s, output: 507.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:37<01:02, 18.33it/s, est. speed input: 1541.35 toks/s, output: 524.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:37<00:43, 26.47it/s, est. speed input: 1711.39 toks/s, output: 592.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:57, 19.61it/s, est. speed input: 1745.69 toks/s, output: 610.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:38<00:50, 22.20it/s, est. speed input: 1851.25 toks/s, output: 652.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:38<00:49, 22.66it/s, est. speed input: 1897.98 toks/s, output: 676.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:38<00:32, 33.78it/s, est. speed input: 2118.33 toks/s, output: 772.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:39<00:22, 47.30it/s, est. speed input: 2340.03 toks/s, output: 854.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:39<00:21, 48.69it/s, est. speed input: 2448.53 toks/s, output: 903.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:39<00:22, 46.73it/s, est. speed input: 2556.08 toks/s, output: 950.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:39<00:27, 37.55it/s, est. speed input: 2641.07 toks/s, output: 995.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:40<00:29, 35.77it/s, est. speed input: 2685.18 toks/s, output: 1011.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:40<00:23, 43.75it/s, est. speed input: 2790.21 toks/s, output: 1062.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:40<00:21, 47.30it/s, est. speed input: 2891.07 toks/s, output: 1111.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:40<00:18, 55.05it/s, est. speed input: 2994.55 toks/s, output: 1167.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:40<00:19, 51.27it/s, est. speed input: 3090.56 toks/s, output: 1209.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:41<00:25, 39.34it/s, est. speed input: 3173.72 toks/s, output: 1242.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:41<00:24, 40.26it/s, est. speed input: 3219.89 toks/s, output: 1267.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:41<00:16, 58.03it/s, est. speed input: 3381.29 toks/s, output: 1349.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:41<00:17, 53.76it/s, est. speed input: 3524.28 toks/s, output: 1424.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:41<00:19, 48.59it/s, est. speed input: 3608.71 toks/s, output: 1474.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:42<00:16, 56.42it/s, est. speed input: 3715.87 toks/s, output: 1521.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:42<00:18, 50.00it/s, est. speed input: 3800.28 toks/s, output: 1567.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:42<00:18, 49.65it/s, est. speed input: 3887.45 toks/s, output: 1616.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:42<00:22, 40.94it/s, est. speed input: 3968.38 toks/s, output: 1663.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:42<00:22, 39.53it/s, est. speed input: 4007.71 toks/s, output: 1684.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:43<00:23, 38.46it/s, est. speed input: 4047.55 toks/s, output: 1704.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:43<00:16, 53.90it/s, est. speed input: 4237.82 toks/s, output: 1803.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:43<00:14, 58.20it/s, est. speed input: 4335.46 toks/s, output: 1850.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:43<00:13, 62.31it/s, est. speed input: 4425.22 toks/s, output: 1900.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:43<00:15, 53.92it/s, est. speed input: 4553.64 toks/s, output: 1965.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:44<00:17, 47.00it/s, est. speed input: 4625.27 toks/s, output: 1999.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:44<00:17, 47.60it/s, est. speed input: 4704.73 toks/s, output: 2045.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:44<00:13, 61.87it/s, est. speed input: 4841.78 toks/s, output: 2129.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:44<00:09, 83.83it/s, est. speed input: 5042.69 toks/s, output: 2245.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:44<00:07, 103.24it/s, est. speed input: 5238.04 toks/s, output: 2354.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:45<00:10, 70.39it/s, est. speed input: 5348.28 toks/s, output: 2422.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:45<00:14, 51.90it/s, est. speed input: 5404.59 toks/s, output: 2473.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:45<00:12, 58.31it/s, est. speed input: 5495.12 toks/s, output: 2533.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:45<00:11, 64.94it/s, est. speed input: 5582.47 toks/s, output: 2599.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:45<00:08, 80.86it/s, est. speed input: 5726.98 toks/s, output: 2687.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:46<00:08, 84.44it/s, est. speed input: 5858.48 toks/s, output: 2764.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:46<00:07, 87.23it/s, est. speed input: 5988.44 toks/s, output: 2839.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:46<00:07, 91.61it/s, est. speed input: 6243.00 toks/s, output: 3010.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:46<00:06, 96.95it/s, est. speed input: 6414.75 toks/s, output: 3115.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:46<00:05, 101.88it/s, est. speed input: 6592.84 toks/s, output: 3231.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:46<00:04, 119.77it/s, est. speed input: 6818.65 toks/s, output: 3369.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 710/1280 [00:47<00:04, 120.29it/s, est. speed input: 6956.11 toks/s, output: 3459.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:47<00:04, 120.83it/s, est. speed input: 7087.32 toks/s, output: 3551.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:47<00:04, 121.56it/s, est. speed input: 7215.07 toks/s, output: 3619.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:47<00:06, 80.73it/s, est. speed input: 7306.72 toks/s, output: 3657.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:47<00:06, 76.74it/s, est. speed input: 7376.76 toks/s, output: 3713.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:47<00:05, 88.66it/s, est. speed input: 7505.04 toks/s, output: 3810.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:48<00:05, 85.29it/s, est. speed input: 7619.96 toks/s, output: 3916.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:48<00:05, 84.78it/s, est. speed input: 7697.99 toks/s, output: 3970.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:48<00:06, 72.75it/s, est. speed input: 7755.62 toks/s, output: 4010.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:48<00:09, 48.91it/s, est. speed input: 7794.26 toks/s, output: 4034.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:49<00:06, 69.87it/s, est. speed input: 7963.47 toks/s, output: 4147.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:49<00:04, 89.75it/s, est. speed input: 8142.19 toks/s, output: 4269.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:49<00:04, 94.99it/s, est. speed input: 8279.53 toks/s, output: 4372.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:49<00:04, 92.93it/s, est. speed input: 8383.58 toks/s, output: 4455.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:49<00:04, 91.05it/s, est. speed input: 8487.25 toks/s, output: 4552.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:49<00:03, 95.13it/s, est. speed input: 8603.64 toks/s, output: 4642.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:49<00:03, 93.50it/s, est. speed input: 8714.02 toks/s, output: 4729.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:50<00:03, 89.87it/s, est. speed input: 8817.98 toks/s, output: 4846.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:50<00:03, 85.65it/s, est. speed input: 8889.56 toks/s, output: 4913.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:50<00:02, 107.94it/s, est. speed input: 9061.66 toks/s, output: 5060.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:50<00:02, 126.57it/s, est. speed input: 9228.28 toks/s, output: 5187.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:50<00:01, 166.76it/s, est. speed input: 9488.64 toks/s, output: 5409.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:50<00:01, 184.86it/s, est. speed input: 9744.56 toks/s, output: 5599.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 1085/1280 [00:50<00:01, 136.58it/s, est. speed input: 9878.15 toks/s, output: 5762.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:51<00:01, 151.18it/s, est. speed input: 10080.31 toks/s, output: 5956.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:51<00:01, 120.10it/s, est. speed input: 10212.04 toks/s, output: 6078.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:51<00:01, 118.60it/s, est. speed input: 10322.67 toks/s, output: 6216.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:51<00:00, 123.17it/s, est. speed input: 10468.13 toks/s, output: 6374.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:51<00:00, 113.96it/s, est. speed input: 10568.21 toks/s, output: 6499.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:51<00:00, 122.50it/s, est. speed input: 10715.41 toks/s, output: 6642.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:52<00:00, 124.26it/s, est. speed input: 10825.89 toks/s, output: 6745.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:52<00:00, 86.48it/s, est. speed input: 10893.96 toks/s, output: 6823.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:53<00:00, 42.95it/s, est. speed input: 10863.19 toks/s, output: 6869.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:53<00:00, 35.25it/s, est. speed input: 10855.07 toks/s, output: 6920.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:55<00:01, 14.39it/s, est. speed input: 10528.48 toks/s, output: 6749.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 15.83it/s, est. speed input: 10548.77 toks/s, output: 6798.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:11<00:00, 15.83it/s, est. speed input: 10580.87 toks/s, output: 6846.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:28<00:00, 1.02it/s, est. speed input: 6714.50 toks/s, output: 4410.65 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:28<00:00, 14.48it/s, est. speed input: 6714.50 toks/s, output: 4410.65 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.1549580991268158, 'actor/pg_clipfrac': 0.0028922632336616516, 'actor/ppo_kl': -1.0495244850972085e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.000483927084133029, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00177624577190727}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04898355156183243, 'actor/pg_clipfrac': 0.000841042899992317, 'actor/ppo_kl': 0.0010225714650005102}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.06960663199424744, 'actor/pg_clipfrac': 0.0019047618843615055, 'actor/ppo_kl': 0.0018770998576655984}
[36m(Runner pid=3309020)[0m Step 63
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.31
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.024
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.007
[36m(Runner pid=3309020)[0m ppo_kl: 4.847839536914478e-06
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.012
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.681
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.681
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 989931
[36m(Runner pid=3309020)[0m balanced_min: 987930
[36m(Runner pid=3309020)[0m max: 991440
[36m(Runner pid=3309020)[0m mean: 988930.5
[36m(Runner pid=3309020)[0m min: 986421
[36m(Runner pid=3309020)[0m minmax_diff: 5019
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.184
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.076
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.121
[36m(Runner pid=3309020)[0m throughput: 1122.65
[36m(Runner pid=3309020)[0m time_per_step: 880.889
[36m(Runner pid=3309020)[0m total_num_tokens: 1977861
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 661.0
[36m(Runner pid=3309020)[0m mean: 463.062
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3792.0
[36m(Runner pid=3309020)[0m mean: 309.539
[36m(Runner pid=3309020)[0m min: 66.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.363
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.681
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.159
[36m(Runner pid=3309020)[0m old: 0.046
[36m(Runner pid=3309020)[0m ref: 0.046
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.286
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.2
[36m(Runner pid=3309020)[0m gen: 125.636
[36m(Runner pid=3309020)[0m old: 90.513
[36m(Runner pid=3309020)[0m ref: 91.757
[36m(Runner pid=3309020)[0m reward: 6.168
[36m(Runner pid=3309020)[0m step: 880.889
[36m(Runner pid=3309020)[0m update_actor: 565.997
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 64; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:17:28 [executor_base.py:219] It took 0.342028 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.54 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:19:06 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:17:28 [executor_base.py:219] It took 0.342125 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:19:07 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:19:07 [executor_base.py:208] It took 0.329191 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.72 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:19:30 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:19:30 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.80 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:19:30 [executor_base.py:208] It took 0.326081 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.80 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.7775701880455017, 'actor/pg_clipfrac': 0.002657218836247921, 'actor/ppo_kl': 0.0007026357925496995}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.11416027694940567, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008977264515124261}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.15326061844825745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.2944149076938629, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0006477280403487384, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0003343367134220898, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.03100285679101944, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002433997142361477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00026552710914984345, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.29144686460494995, 'actor/pg_clipfrac': 0.0011520737316459417, 'actor/ppo_kl': 0.00033275980968028307}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0005411933525465429, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010908416006714106}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.1948745846748352, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.37064406275749207, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.33121243119239807, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006993722636252642}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.6568654775619507, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.4979872405529022, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.000230822988669388, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.192797056632116e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.7837845683097839, 'actor/pg_clipfrac': 0.0008278145687654614, 'actor/ppo_kl': -0.0010881645139306784}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.33374443650245667, 'actor/pg_clipfrac': 0.0012507817009463906, 'actor/ppo_kl': 0.0005237765144556761}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002862393157556653, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012153920251876116}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.1639014035463333, 'actor/pg_clipfrac': 0.0008250825339928269, 'actor/ppo_kl': -1.1206459930690471e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4476524293422699, 'actor/pg_clipfrac': 0.000866551126819104, 'actor/ppo_kl': 0.0004708886845037341}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 1.3449621200561523, 'actor/pg_clipfrac': 0.00043712661135941744, 'actor/ppo_kl': -0.0001584598794579506}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.14579623937606812, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00020464022236410528}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3485735058784485, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014050157042220235}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.022330887615680695, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.807587826391682e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.10352735966444016, 'actor/pg_clipfrac': 0.001377410488203168, 'actor/ppo_kl': 0.0013761743903160095}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.038593001663684845, 'actor/pg_clipfrac': 0.0023885350674390793, 'actor/ppo_kl': -7.177158840931952e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.7698052525520325, 'actor/pg_clipfrac': 0.0018744142726063728, 'actor/ppo_kl': -0.0021153355482965708}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.24589703977108002, 'actor/pg_clipfrac': 0.002822865266352892, 'actor/ppo_kl': -1.933461862790864e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0006337895756587386, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002180510200560093}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002780029608402401, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008039638632908463}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.07681266963481903, 'actor/pg_clipfrac': 0.0016722407890483737, 'actor/ppo_kl': 0.0012291617458686233}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.06471619755029678, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005747477407567203}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00037500407779589295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00030484513263218105}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.2498055249452591, 'actor/pg_clipfrac': 0.0018248175038024783, 'actor/ppo_kl': 0.0019180409144610167}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.24719007313251495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041002771467901766}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0609043687582016, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -2.5745306629687548e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.023668991401791573, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000247558462433517}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2731655538082123, 'actor/pg_clipfrac': 0.0007674596854485571, 'actor/ppo_kl': 0.00024356241920031607}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00024111206585075706, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002282410132465884}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0003795711963903159, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0017076723743230104}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.15507927536964417, 'actor/pg_clipfrac': 0.0016155089251697063, 'actor/ppo_kl': 0.0010752762900665402}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00015518158033955842, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006204304518178105}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.6050390005111694, 'actor/pg_clipfrac': 0.0016339869471266866, 'actor/ppo_kl': -0.0006762174307368696}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.00023040416999720037, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002684023231267929}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.10219915211200714, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006407205946743488}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002164982579415664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007721330039203167}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0005250132526271045, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006831990904174745}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00026621611323207617, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011397793423384428}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.2502812147140503, 'actor/pg_clipfrac': 0.0042553190141916275, 'actor/ppo_kl': -0.002370943082496524}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005223298794589937, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002490670420229435}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.09809087216854095, 'actor/pg_clipfrac': 0.002479338785633445, 'actor/ppo_kl': -0.0027222137432545424}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.000644384475890547, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007245648303069174}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:36:34, 4.54s/it, est. speed input: 100.56 toks/s, output: 27.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<40:32, 1.92s/it, est. speed input: 199.59 toks/s, output: 54.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:24<24:51, 1.18s/it, est. speed input: 282.42 toks/s, output: 73.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<21:03, 1.00s/it, est. speed input: 327.39 toks/s, output: 88.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<14:19, 1.46it/s, est. speed input: 396.69 toks/s, output: 114.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<12:56, 1.61it/s, est. speed input: 436.79 toks/s, output: 126.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<08:58, 2.31it/s, est. speed input: 506.97 toks/s, output: 149.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<06:53, 3.00it/s, est. speed input: 568.33 toks/s, output: 168.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:32<05:00, 4.11it/s, est. speed input: 636.65 toks/s, output: 188.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:33<04:38, 4.41it/s, est. speed input: 689.55 toks/s, output: 207.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<05:56, 3.44it/s, est. speed input: 709.95 toks/s, output: 217.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<03:35, 5.63it/s, est. speed input: 839.44 toks/s, output: 258.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:15, 8.86it/s, est. speed input: 963.75 toks/s, output: 305.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:36<01:53, 10.55it/s, est. speed input: 1022.82 toks/s, output: 329.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:36<01:23, 14.26it/s, est. speed input: 1142.44 toks/s, output: 367.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:37<01:09, 16.94it/s, est. speed input: 1253.91 toks/s, output: 407.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:37<01:01, 19.12it/s, est. speed input: 1372.09 toks/s, output: 452.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:37<00:59, 19.59it/s, est. speed input: 1425.20 toks/s, output: 473.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:47, 23.97it/s, est. speed input: 1584.99 toks/s, output: 537.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:38<00:43, 26.26it/s, est. speed input: 1687.65 toks/s, output: 575.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:38<00:56, 20.19it/s, est. speed input: 1722.68 toks/s, output: 588.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:39<01:01, 18.42it/s, est. speed input: 1810.47 toks/s, output: 631.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:53, 20.86it/s, est. speed input: 1903.16 toks/s, output: 677.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:40<01:03, 17.56it/s, est. speed input: 1932.60 toks/s, output: 695.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:45, 23.99it/s, est. speed input: 2042.08 toks/s, output: 752.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:41<01:03, 17.27it/s, est. speed input: 2108.46 toks/s, output: 775.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:41<00:55, 19.48it/s, est. speed input: 2159.57 toks/s, output: 799.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:49, 21.97it/s, est. speed input: 2210.49 toks/s, output: 825.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:42<00:58, 18.33it/s, est. speed input: 2245.49 toks/s, output: 839.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:42<00:53, 20.10it/s, est. speed input: 2291.88 toks/s, output: 860.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:42<00:58, 18.14it/s, est. speed input: 2326.06 toks/s, output: 884.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:42<00:55, 19.01it/s, est. speed input: 2369.60 toks/s, output: 907.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:43<00:38, 26.95it/s, est. speed input: 2468.37 toks/s, output: 960.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:43<00:28, 36.55it/s, est. speed input: 2588.04 toks/s, output: 1012.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:43<00:27, 37.77it/s, est. speed input: 2638.06 toks/s, output: 1043.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:43<00:26, 38.85it/s, est. speed input: 2683.41 toks/s, output: 1069.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:43<00:18, 53.96it/s, est. speed input: 2831.09 toks/s, output: 1131.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:43<00:21, 47.16it/s, est. speed input: 2965.33 toks/s, output: 1182.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:44<00:19, 50.73it/s, est. speed input: 3060.19 toks/s, output: 1225.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:44<00:18, 53.81it/s, est. speed input: 3152.21 toks/s, output: 1278.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:44<00:17, 56.51it/s, est. speed input: 3243.89 toks/s, output: 1322.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:44<00:14, 64.63it/s, est. speed input: 3342.79 toks/s, output: 1375.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:44<00:17, 54.24it/s, est. speed input: 3426.99 toks/s, output: 1421.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:44<00:13, 71.67it/s, est. speed input: 3578.27 toks/s, output: 1500.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 355/1280 [00:45<00:15, 59.70it/s, est. speed input: 3663.98 toks/s, output: 1556.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:45<00:14, 61.70it/s, est. speed input: 3757.52 toks/s, output: 1605.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:45<00:13, 65.00it/s, est. speed input: 3845.88 toks/s, output: 1657.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:45<00:14, 62.25it/s, est. speed input: 3925.62 toks/s, output: 1698.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:45<00:10, 84.90it/s, est. speed input: 4113.47 toks/s, output: 1804.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:46<00:11, 72.29it/s, est. speed input: 4284.90 toks/s, output: 1888.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:10, 81.44it/s, est. speed input: 4415.07 toks/s, output: 1968.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:46<00:09, 90.02it/s, est. speed input: 4558.70 toks/s, output: 2056.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:46<00:07, 101.27it/s, est. speed input: 4783.89 toks/s, output: 2191.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:46<00:07, 106.22it/s, est. speed input: 4917.50 toks/s, output: 2277.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:46<00:08, 89.41it/s, est. speed input: 5057.56 toks/s, output: 2366.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:47<00:08, 90.17it/s, est. speed input: 5183.11 toks/s, output: 2443.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:47<00:06, 119.97it/s, est. speed input: 5430.70 toks/s, output: 2589.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:47<00:07, 96.29it/s, est. speed input: 5555.43 toks/s, output: 2657.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:47<00:07, 87.47it/s, est. speed input: 5714.15 toks/s, output: 2758.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:47<00:09, 73.93it/s, est. speed input: 5819.50 toks/s, output: 2826.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:48<00:08, 74.93it/s, est. speed input: 5904.54 toks/s, output: 2895.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:48<00:08, 79.69it/s, est. speed input: 6026.91 toks/s, output: 2971.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:48<00:07, 83.87it/s, est. speed input: 6146.00 toks/s, output: 3041.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:48<00:08, 75.06it/s, est. speed input: 6221.36 toks/s, output: 3093.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:48<00:07, 82.32it/s, est. speed input: 6383.07 toks/s, output: 3213.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:49<00:09, 65.71it/s, est. speed input: 6448.67 toks/s, output: 3257.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:49<00:07, 75.39it/s, est. speed input: 6568.27 toks/s, output: 3342.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:49<00:07, 79.50it/s, est. speed input: 6650.14 toks/s, output: 3399.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:49<00:09, 61.45it/s, est. speed input: 6709.22 toks/s, output: 3446.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:49<00:07, 77.00it/s, est. speed input: 6871.45 toks/s, output: 3577.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:49<00:06, 78.66it/s, est. speed input: 6950.04 toks/s, output: 3634.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:50<00:08, 63.41it/s, est. speed input: 7009.04 toks/s, output: 3668.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:50<00:06, 83.55it/s, est. speed input: 7170.48 toks/s, output: 3796.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:50<00:04, 104.76it/s, est. speed input: 7330.78 toks/s, output: 3946.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:50<00:05, 79.13it/s, est. speed input: 7425.34 toks/s, output: 4019.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:50<00:04, 91.44it/s, est. speed input: 7546.95 toks/s, output: 4115.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:50<00:04, 89.51it/s, est. speed input: 7660.54 toks/s, output: 4191.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:51<00:04, 97.47it/s, est. speed input: 7778.50 toks/s, output: 4295.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:51<00:03, 120.22it/s, est. speed input: 8000.99 toks/s, output: 4449.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:51<00:02, 132.31it/s, est. speed input: 8160.12 toks/s, output: 4563.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:51<00:02, 131.66it/s, est. speed input: 8280.81 toks/s, output: 4664.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:51<00:03, 88.26it/s, est. speed input: 8361.38 toks/s, output: 4766.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:51<00:04, 82.35it/s, est. speed input: 8461.44 toks/s, output: 4861.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:52<00:03, 99.61it/s, est. speed input: 8616.15 toks/s, output: 5002.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:52<00:02, 116.17it/s, est. speed input: 8822.49 toks/s, output: 5123.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:52<00:02, 114.01it/s, est. speed input: 8929.15 toks/s, output: 5222.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:52<00:02, 101.25it/s, est. speed input: 9030.81 toks/s, output: 5312.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:52<00:02, 93.82it/s, est. speed input: 9131.19 toks/s, output: 5429.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:52<00:02, 103.75it/s, est. speed input: 9245.03 toks/s, output: 5535.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:52<00:01, 112.71it/s, est. speed input: 9365.38 toks/s, output: 5643.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:53<00:01, 105.37it/s, est. speed input: 9469.27 toks/s, output: 5738.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:53<00:01, 138.91it/s, est. speed input: 9701.80 toks/s, output: 5953.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:53<00:01, 104.14it/s, est. speed input: 9782.39 toks/s, output: 6074.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:53<00:01, 107.68it/s, est. speed input: 9936.45 toks/s, output: 6188.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:53<00:01, 106.46it/s, est. speed input: 10045.52 toks/s, output: 6317.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:54<00:01, 79.76it/s, est. speed input: 10118.51 toks/s, output: 6395.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:54<00:00, 107.07it/s, est. speed input: 10311.76 toks/s, output: 6611.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:54<00:00, 79.91it/s, est. speed input: 10377.14 toks/s, output: 6714.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:54<00:00, 58.11it/s, est. speed input: 10423.56 toks/s, output: 6798.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:55<00:00, 54.25it/s, est. speed input: 10463.04 toks/s, output: 6870.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:00, 36.50it/s, est. speed input: 10437.85 toks/s, output: 6884.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:55<00:00, 40.45it/s, est. speed input: 10495.03 toks/s, output: 6980.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:57<00:00, 22.09it/s, est. speed input: 10377.89 toks/s, output: 6954.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:57<00:00, 25.77it/s, est. speed input: 10418.60 toks/s, output: 7032.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 20.05it/s, est. speed input: 10361.10 toks/s, output: 7029.42 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:57<00:00, 22.16it/s, est. speed input: 10361.10 toks/s, output: 7029.42 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.21167109906673431, 'actor/pg_clipfrac': 0.0022727272007614374, 'actor/ppo_kl': 0.001069618039764464}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0003106749791186303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002575249527581036}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0006353760836645961, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016627967124804854}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0005200027953833342, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0020547588355839252}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.21331261098384857, 'actor/pg_clipfrac': 0.0011890606256201863, 'actor/ppo_kl': 0.00046903768088668585}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.4448014795780182, 'actor/pg_clipfrac': 0.000859845255035907, 'actor/ppo_kl': 0.0010259797563776374}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.07056873291730881, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013656328665092587}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.30904895067214966, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011894956696778536}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00030268487171269953, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015350875910371542}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0004549327422864735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014236870920285583}
[36m(Runner pid=3309020)[0m Step 64
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.255
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.023
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.02
[36m(Runner pid=3309020)[0m ppo_kl: 4.6499907101349434e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.039
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.039
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.684
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.684
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 995138
[36m(Runner pid=3309020)[0m balanced_min: 992627
[36m(Runner pid=3309020)[0m max: 1004031
[36m(Runner pid=3309020)[0m mean: 993882.5
[36m(Runner pid=3309020)[0m min: 983734
[36m(Runner pid=3309020)[0m minmax_diff: 20297
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.104
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.137
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.122
[36m(Runner pid=3309020)[0m throughput: 1121.346
[36m(Runner pid=3309020)[0m time_per_step: 886.33
[36m(Runner pid=3309020)[0m total_num_tokens: 1987765
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 693.0
[36m(Runner pid=3309020)[0m mean: 464.543
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 5500.0
[36m(Runner pid=3309020)[0m mean: 311.928
[36m(Runner pid=3309020)[0m min: 55.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.37
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.684
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.950328734025279e-05
[36m(Runner pid=3309020)[0m gen: 0.171
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.284
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.178
[36m(Runner pid=3309020)[0m gen: 136.449
[36m(Runner pid=3309020)[0m old: 88.027
[36m(Runner pid=3309020)[0m ref: 88.961
[36m(Runner pid=3309020)[0m reward: 6.691
[36m(Runner pid=3309020)[0m step: 886.33
[36m(Runner pid=3309020)[0m update_actor: 565.404
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 65; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:32:15 [executor_base.py:219] It took 0.342991 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.62 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.79 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:32:15 [executor_base.py:219] It took 0.342977 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:33:44 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:33:44 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.87 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:33:44 [executor_base.py:208] It took 0.325932 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.87 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:33:47 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:33:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:33:48 [executor_base.py:208] It took 0.327575 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.04376417398452759, 'actor/pg_clipfrac': 0.0005076142260804772, 'actor/ppo_kl': -3.558996468200348e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0004509019781835377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001068067504093051}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.3196883797645569, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.45073795318603516, 'actor/pg_clipfrac': 0.0013568521244451404, 'actor/ppo_kl': 0.0010993833420798182}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.5956810116767883, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2846299707889557, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.12695573270320892, 'actor/pg_clipfrac': 0.003699136897921562, 'actor/ppo_kl': -0.0002830049197655171}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.09910497814416885, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2832934260368347, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00041782722109928727, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.48169592022895813, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.26241159439086914, 'actor/pg_clipfrac': 0.002154011745005846, 'actor/ppo_kl': -0.00048370033618994057}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00029387493850663304, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.2768241763114929, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0004378550802357495, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.17601637542247772, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00018884653400164098}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00026572158094495535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00010461059719091281}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00019998948846478015, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004963772371411324}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.2094147950410843, 'actor/pg_clipfrac': 0.0021843600552529097, 'actor/ppo_kl': 0.00038245716132223606}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.11706230789422989, 'actor/pg_clipfrac': 0.0010593220358714461, 'actor/ppo_kl': -0.0005734411533921957}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.20848119258880615, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007142607355490327}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.19043751060962677, 'actor/pg_clipfrac': 0.0007102272938936949, 'actor/ppo_kl': 9.938939183484763e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3388758599758148, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014982310822233558}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.07475821673870087, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00046495560673065484}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.17311476171016693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007078704074956477}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.1353897899389267, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00124008406419307}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.1133623793721199, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001119151245802641}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00025551076396368444, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009571126429364085}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.20644021034240723, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014141175197437406}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.10916820168495178, 'actor/pg_clipfrac': 0.001580611220560968, 'actor/ppo_kl': -0.00034677868825383484}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.4471728801727295, 'actor/pg_clipfrac': 0.001682085799984634, 'actor/ppo_kl': 1.605605757504236e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0006099320598877966, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010547578567638993}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0003157255705446005, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002980241843033582}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2828781306743622, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016891451086848974}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002762829535640776, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00010184576967731118}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00030469329794868827, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016585076227784157}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.07811523973941803, 'actor/pg_clipfrac': 0.0008340283529832959, 'actor/ppo_kl': 0.00031842858879826963}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6896860599517822, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015942450845614076}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.1689317673444748, 'actor/pg_clipfrac': 0.0007267441833391786, 'actor/ppo_kl': 0.0014783268561586738}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002499183756299317, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000522961316164583}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.11024507135152817, 'actor/pg_clipfrac': 0.0006734006456099451, 'actor/ppo_kl': 0.0002905309374909848}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.21619904041290283, 'actor/pg_clipfrac': 0.001221747137606144, 'actor/ppo_kl': -0.0002035421784967184}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.12887735664844513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001372472761431709}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.11045396327972412, 'actor/pg_clipfrac': 0.0031847134232521057, 'actor/ppo_kl': -0.00021252359147183597}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0005669139791280031, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00025405982160009444}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.17521566152572632, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012808659812435508}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.8309075832366943, 'actor/pg_clipfrac': 0.000425713078584522, 'actor/ppo_kl': 0.00018673503655008972}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.28406330943107605, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013544930843636394}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:15<1:38:38, 15.74s/it, est. speed input: 29.54 toks/s, output: 5.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<40:59, 6.56s/it, est. speed input: 57.65 toks/s, output: 12.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:16<22:35, 3.62s/it, est. speed input: 86.67 toks/s, output: 19.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 7/377 [00:16<06:37, 1.08s/it, est. speed input: 197.20 toks/s, output: 47.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 11/377 [00:16<03:22, 1.80it/s, est. speed input: 304.65 toks/s, output: 74.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 16/377 [00:16<01:50, 3.27it/s, est. speed input: 437.00 toks/s, output: 111.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 20/377 [00:16<01:14, 4.77it/s, est. speed input: 544.15 toks/s, output: 141.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 26/377 [00:17<00:46, 7.63it/s, est. speed input: 702.64 toks/s, output: 187.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 29/377 [00:17<00:38, 9.07it/s, est. speed input: 780.25 toks/s, output: 209.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 33/377 [00:17<00:29, 11.63it/s, est. speed input: 883.65 toks/s, output: 241.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 40/377 [00:17<00:19, 17.72it/s, est. speed input: 1062.59 toks/s, output: 299.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 44/377 [00:17<00:17, 19.21it/s, est. speed input: 1158.43 toks/s, output: 332.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 48/377 [00:17<00:19, 16.89it/s, est. speed input: 1243.86 toks/s, output: 361.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 52/377 [00:18<00:17, 18.73it/s, est. speed input: 1335.68 toks/s, output: 395.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 56/377 [00:18<00:15, 21.33it/s, est. speed input: 1429.22 toks/s, output: 430.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 63/377 [00:18<00:10, 28.94it/s, est. speed input: 1596.51 toks/s, output: 494.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 71/377 [00:18<00:08, 37.55it/s, est. speed input: 1791.73 toks/s, output: 568.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 76/377 [00:18<00:08, 34.38it/s, est. speed input: 1911.81 toks/s, output: 612.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 82/377 [00:18<00:07, 38.09it/s, est. speed input: 2046.15 toks/s, output: 669.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 87/377 [00:18<00:07, 39.22it/s, est. speed input: 2156.55 toks/s, output: 716.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 96/377 [00:18<00:05, 49.17it/s, est. speed input: 2363.18 toks/s, output: 806.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 103/377 [00:19<00:05, 52.49it/s, est. speed input: 2518.17 toks/s, output: 875.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 109/377 [00:19<00:05, 46.26it/s, est. speed input: 2641.84 toks/s, output: 931.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 116/377 [00:19<00:05, 47.47it/s, est. speed input: 2792.70 toks/s, output: 1000.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 121/377 [00:19<00:05, 46.95it/s, est. speed input: 2893.55 toks/s, output: 1050.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 131/377 [00:19<00:04, 57.05it/s, est. speed input: 3112.19 toks/s, output: 1156.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:19<00:04, 52.10it/s, est. speed input: 3232.85 toks/s, output: 1217.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 145/377 [00:19<00:04, 56.36it/s, est. speed input: 3403.82 toks/s, output: 1302.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 153/377 [00:20<00:03, 56.81it/s, est. speed input: 3563.98 toks/s, output: 1387.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 159/377 [00:20<00:04, 47.79it/s, est. speed input: 3672.20 toks/s, output: 1446.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 166/377 [00:20<00:04, 49.27it/s, est. speed input: 3809.32 toks/s, output: 1521.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 177/377 [00:20<00:03, 61.76it/s, est. speed input: 4039.08 toks/s, output: 1650.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 184/377 [00:20<00:03, 52.74it/s, est. speed input: 4160.54 toks/s, output: 1723.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 190/377 [00:20<00:03, 49.94it/s, est. speed input: 4266.50 toks/s, output: 1789.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 198/377 [00:20<00:03, 54.40it/s, est. speed input: 4427.12 toks/s, output: 1885.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▍ | 206/377 [00:21<00:03, 55.47it/s, est. speed input: 4578.79 toks/s, output: 1980.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 216/377 [00:21<00:02, 63.34it/s, est. speed input: 4778.55 toks/s, output: 2105.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 223/377 [00:21<00:02, 62.07it/s, est. speed input: 4910.04 toks/s, output: 2191.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 230/377 [00:21<00:02, 58.78it/s, est. speed input: 5034.31 toks/s, output: 2277.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 237/377 [00:21<00:02, 59.57it/s, est. speed input: 5160.87 toks/s, output: 2366.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 246/377 [00:21<00:02, 62.30it/s, est. speed input: 5328.34 toks/s, output: 2483.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 256/377 [00:21<00:01, 69.39it/s, est. speed input: 5519.31 toks/s, output: 2618.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 264/377 [00:21<00:01, 59.15it/s, est. speed input: 5646.40 toks/s, output: 2717.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 273/377 [00:22<00:01, 64.17it/s, est. speed input: 5808.85 toks/s, output: 2843.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 281/377 [00:22<00:01, 64.44it/s, est. speed input: 5948.23 toks/s, output: 2952.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 289/377 [00:22<00:01, 67.06it/s, est. speed input: 6092.40 toks/s, output: 3068.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 296/377 [00:22<00:01, 54.47it/s, est. speed input: 6192.58 toks/s, output: 3157.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 302/377 [00:22<00:01, 50.69it/s, est. speed input: 6279.49 toks/s, output: 3239.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 308/377 [00:22<00:01, 51.03it/s, est. speed input: 6373.01 toks/s, output: 3327.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 314/377 [00:22<00:01, 51.92it/s, est. speed input: 6468.12 toks/s, output: 3416.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▍ | 320/377 [00:23<00:01, 38.60it/s, est. speed input: 6518.62 toks/s, output: 3487.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 330/377 [00:23<00:01, 45.34it/s, est. speed input: 6680.09 toks/s, output: 3649.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 335/377 [00:23<00:00, 42.27it/s, est. speed input: 6739.95 toks/s, output: 3723.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 340/377 [00:23<00:00, 38.52it/s, est. speed input: 6798.91 toks/s, output: 3797.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 345/377 [00:23<00:00, 40.00it/s, est. speed input: 6868.88 toks/s, output: 3882.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 350/377 [00:24<00:01, 24.07it/s, est. speed input: 6845.77 toks/s, output: 3918.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 354/377 [00:24<00:01, 18.36it/s, est. speed input: 6814.35 toks/s, output: 3946.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 359/377 [00:24<00:00, 21.25it/s, est. speed input: 6871.81 toks/s, output: 4040.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 362/377 [00:24<00:00, 18.88it/s, est. speed input: 6868.22 toks/s, output: 4076.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 365/377 [00:25<00:01, 11.81it/s, est. speed input: 6767.90 toks/s, output: 4059.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [00:25<00:00, 11.60it/s, est. speed input: 6758.95 toks/s, output: 4084.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 368/377 [00:39<00:00, 11.60it/s, est. speed input: 6209.16 toks/s, output: 3777.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:54<00:23, 2.95s/it, est. speed input: 3203.47 toks/s, output: 2017.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:56<00:19, 2.83s/it, est. speed input: 3099.17 toks/s, output: 2022.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [01:04<00:15, 3.14s/it, est. speed input: 2730.15 toks/s, output: 1919.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [01:06<00:07, 2.60s/it, est. speed input: 2648.08 toks/s, output: 2012.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 377/377 [01:06<00:00, 1.65s/it, est. speed input: 2665.59 toks/s, output: 2255.87 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:06<00:00, 5.64it/s, est. speed input: 2665.59 toks/s, output: 2255.87 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.571782648563385, 'actor/pg_clipfrac': 0.0012172854039818048, 'actor/ppo_kl': 2.293810211995151e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0004053738957736641, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008417367935180664}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.08565539121627808, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009952205000445247}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00020649337966460735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001169156632386148}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.5477759838104248, 'actor/pg_clipfrac': 0.002722322940826416, 'actor/ppo_kl': -0.0018846075981855392}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.13308152556419373, 'actor/pg_clipfrac': 0.0008628127980045974, 'actor/ppo_kl': -0.00027151493122801185}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.3878633975982666, 'actor/pg_clipfrac': 0.001669449033215642, 'actor/ppo_kl': 0.0005197477294132113}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.18312974274158478, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010213537607342005}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.44973787665367126, 'actor/pg_clipfrac': 0.002020202111452818, 'actor/ppo_kl': 0.0011975105153396726}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.6311337947845459, 'actor/pg_clipfrac': 0.0021276595070958138, 'actor/ppo_kl': -0.0007036858587525785}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.19257616996765137, 'actor/pg_clipfrac': 0.0006882312591187656, 'actor/ppo_kl': -0.0003039365983568132}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.27891916036605835, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016260825796052814}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.17382179200649261, 'actor/pg_clipfrac': 0.0008984726155176759, 'actor/ppo_kl': 0.001634796615689993}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.5748914480209351, 'actor/pg_clipfrac': 0.0008368201088160276, 'actor/ppo_kl': 5.8939465816365555e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.026635214686393738, 'actor/pg_clipfrac': 0.000546746829058975, 'actor/ppo_kl': -0.00046788647887296975}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00032139290124177933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013050298439338803}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.59 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:47:03 [executor_base.py:219] It took 0.340002 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:47:03 [executor_base.py:219] It took 0.339936 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:49:01 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:49:02 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:49:02 [executor_base.py:208] It took 0.328201 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m validation generation end
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:49:02 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:49:02 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:49:02 [executor_base.py:208] It took 0.327175 seconds to fall asleep.
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since D is the midpoint of AB, AD = DB = 2.0. The line l is a vertical line passing through D, and E is any point on l. This means that AE + ED is minimized when E is directly below D on line l.\n\nThe perimeter of triangle ABC is given as 16.0. Since D is the midpoint of AB, AD + DB = AB = 2 * 2.0 = 4.0. Therefore, AC + BC = 16.0 - 4.0 = 12.0.\n\nNow, to minimize AE + EC, we note that the shortest path from A to E and from E to C (over the line l) is a straight line. Thus, the minimum perimeter of triangle AEC is simply the length of AC, which is 12.0.\n\nTherefore, the minimum perimeter of triangle AEC is 12.0.\n\n\n12.0\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90.0 degrees to obtain triangle AB'C', we know that angle BAC = 90.0 degrees. Since the rotation is 90.0 degrees, angle B'AC' = 90.0 degrees as well and triangle AB'C' is congruent to triangle ABC. \n\nGiven that angle CC'B' = 32.0 degrees, we can determine angle AC'B' as follows:\n- Since angle BAC = 90.0 degrees, and angle AB'C' = angle BAC = 90.0 degrees (because of the rotation), angle AC'B' = 90.0 degrees - angle CC'B' = 90.0 degrees - 32.0 degrees = 58.0 degrees.\n\nTherefore, angle AC'B' = 58.0 degrees.\n\n\nThe size of angle AC'B' is 58.0 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that A, B, and D are vertices of a parallelogram ABCD and lie on circle O, and C is on the diameter BE of circle O, we can deduce several properties. Since C is on the diameter BE, angle BAE is a right angle (90°) because any angle inscribed in a semicircle is a right angle. Given that angle ADC = 54°, and since ABCD is a parallelogram, angle ABC = angle ADC = 54°. Therefore, angle AEB, being the angle of the triangle ABE, can be found by subtracting angle BAE (90°) and angle ABC (54°) from 180°, as the sum of angles in a triangle is 180°. Thus, angle AEB = 180° - 90° - 54° = 36°.\n\n\nThe degree of angle AEB is 36°.\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to understand the properties of a circle and the relationships between angles in a circle. Since AB is the diameter of circle O, angle ADB is a right angle (90 degrees) because any angle subtended by a diameter in a semicircle is a right angle. Given that angle A = 35 degrees, we can find angle ADB by subtracting angle A from 90 degrees. Then, since CD is tangent to the circle at D, angle CDB is also a right angle (90 degrees). Therefore, angle C can be found by subtracting angle ADB from 90 degrees.\n\n\nSince angle ADB = 90 - 35 = 55 degrees, and angle CDB is 90 degrees, angle C = 90 - 55 = 35 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O, angle ACD is a right angle (90 degrees) because any angle subtended by a diameter in a circle is a right angle. Given that angle EAC = 120 degrees, we can find angle CAD by subtracting 120 degrees from 180 degrees (since angle ACD is a right angle, 90 degrees), giving us angle CAD = 60 degrees. Since angle ABC is an inscribed angle that intercepts the same arc as angle ADC, it is half of angle ADC. Therefore, angle ABC = 1/2 * angle ADC = 1/2 * 60 degrees = 30 degrees.\n\n\nThe degree of angle ABC is 30 degrees.\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_50
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 65
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.241
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.031
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.009
[36m(Runner pid=3309020)[0m ppo_kl: 1.3735592753505444e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.014
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.672
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.672
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1001005
[36m(Runner pid=3309020)[0m balanced_min: 1001005
[36m(Runner pid=3309020)[0m max: 1004423
[36m(Runner pid=3309020)[0m mean: 1001005.0
[36m(Runner pid=3309020)[0m min: 997587
[36m(Runner pid=3309020)[0m minmax_diff: 6836
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.142
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.137
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 945.22
[36m(Runner pid=3309020)[0m time_per_step: 1059.018
[36m(Runner pid=3309020)[0m total_num_tokens: 2002010
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.799
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1527.0
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:14<1:01:38, 2.90s/it, est. speed input: 150.32 toks/s, output: 22.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:25<53:42, 2.54s/it, est. speed input: 174.21 toks/s, output: 38.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<29:47, 1.41s/it, est. speed input: 254.44 toks/s, output: 61.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<22:24, 1.07s/it, est. speed input: 309.18 toks/s, output: 79.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:30<16:07, 1.30it/s, est. speed input: 373.88 toks/s, output: 102.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<12:29, 1.67it/s, est. speed input: 432.73 toks/s, output: 121.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<10:33, 1.97it/s, est. speed input: 484.72 toks/s, output: 138.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<08:11, 2.52it/s, est. speed input: 541.15 toks/s, output: 158.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:36<08:21, 2.46it/s, est. speed input: 568.19 toks/s, output: 174.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<06:19, 3.24it/s, est. speed input: 625.74 toks/s, output: 196.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<04:51, 4.20it/s, est. speed input: 684.40 toks/s, output: 215.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:38<04:51, 4.19it/s, est. speed input: 720.87 toks/s, output: 225.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:38<03:39, 5.53it/s, est. speed input: 771.15 toks/s, output: 247.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:39<03:30, 5.76it/s, est. speed input: 813.93 toks/s, output: 267.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:39<02:00, 9.96it/s, est. speed input: 930.70 toks/s, output: 307.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:39<01:44, 11.48it/s, est. speed input: 982.86 toks/s, output: 326.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:40<02:03, 9.60it/s, est. speed input: 1022.99 toks/s, output: 347.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:40<01:44, 11.35it/s, est. speed input: 1075.28 toks/s, output: 364.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:41<01:58, 9.98it/s, est. speed input: 1116.32 toks/s, output: 379.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:41<00:59, 19.72it/s, est. speed input: 1283.11 toks/s, output: 455.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:41<00:34, 33.02it/s, est. speed input: 1493.76 toks/s, output: 539.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:41<00:33, 33.72it/s, est. speed input: 1592.50 toks/s, output: 584.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:43<01:03, 17.58it/s, est. speed input: 1654.27 toks/s, output: 611.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:43<01:00, 18.50it/s, est. speed input: 1695.21 toks/s, output: 635.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:43<01:09, 15.97it/s, est. speed input: 1731.09 toks/s, output: 652.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:44<01:06, 16.82it/s, est. speed input: 1775.73 toks/s, output: 675.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:44<00:32, 33.24it/s, est. speed input: 1979.86 toks/s, output: 775.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:44<00:35, 30.04it/s, est. speed input: 2064.12 toks/s, output: 819.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:44<00:25, 42.19it/s, est. speed input: 2209.11 toks/s, output: 899.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:44<00:23, 45.20it/s, est. speed input: 2298.37 toks/s, output: 938.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:45<00:26, 39.08it/s, est. speed input: 2386.99 toks/s, output: 981.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:45<00:24, 43.00it/s, est. speed input: 2483.69 toks/s, output: 1029.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:45<00:17, 57.56it/s, est. speed input: 2625.63 toks/s, output: 1097.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:45<00:21, 46.08it/s, est. speed input: 2703.82 toks/s, output: 1148.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:46<00:18, 52.77it/s, est. speed input: 2839.45 toks/s, output: 1220.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:46<00:20, 48.36it/s, est. speed input: 2927.37 toks/s, output: 1263.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:46<00:24, 40.43it/s, est. speed input: 3005.32 toks/s, output: 1309.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:47<00:32, 29.51it/s, est. speed input: 3026.38 toks/s, output: 1325.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:47<00:28, 33.82it/s, est. speed input: 3106.79 toks/s, output: 1375.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:47<00:26, 35.90it/s, est. speed input: 3148.72 toks/s, output: 1387.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:47<00:23, 39.80it/s, est. speed input: 3228.16 toks/s, output: 1435.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:47<00:28, 33.53it/s, est. speed input: 3260.99 toks/s, output: 1458.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:47<00:22, 41.10it/s, est. speed input: 3348.77 toks/s, output: 1502.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:48<00:15, 59.24it/s, est. speed input: 3530.61 toks/s, output: 1615.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:48<00:14, 61.95it/s, est. speed input: 3611.52 toks/s, output: 1666.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:48<00:11, 74.12it/s, est. speed input: 3746.82 toks/s, output: 1748.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:48<00:11, 74.76it/s, est. speed input: 3832.94 toks/s, output: 1803.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:49<00:23, 37.03it/s, est. speed input: 3873.33 toks/s, output: 1834.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:49<00:17, 49.48it/s, est. speed input: 4002.85 toks/s, output: 1913.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:49<00:14, 58.57it/s, est. speed input: 4127.35 toks/s, output: 1990.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:49<00:17, 47.90it/s, est. speed input: 4196.87 toks/s, output: 2045.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:50<00:16, 48.36it/s, est. speed input: 4274.17 toks/s, output: 2096.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:50<00:12, 61.89it/s, est. speed input: 4400.43 toks/s, output: 2174.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:50<00:11, 67.38it/s, est. speed input: 4526.05 toks/s, output: 2249.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:50<00:12, 64.55it/s, est. speed input: 4603.34 toks/s, output: 2302.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:50<00:12, 62.81it/s, est. speed input: 4679.24 toks/s, output: 2353.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:50<00:09, 74.60it/s, est. speed input: 4839.09 toks/s, output: 2457.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 555/1280 [00:50<00:07, 97.70it/s, est. speed input: 5006.29 toks/s, output: 2583.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:51<00:07, 96.39it/s, est. speed input: 5130.90 toks/s, output: 2653.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:52<00:17, 39.40it/s, est. speed input: 5174.39 toks/s, output: 2719.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:52<00:15, 44.52it/s, est. speed input: 5245.65 toks/s, output: 2760.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:52<00:11, 56.13it/s, est. speed input: 5370.61 toks/s, output: 2841.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:52<00:08, 75.30it/s, est. speed input: 5534.03 toks/s, output: 2965.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:52<00:06, 102.76it/s, est. speed input: 5745.74 toks/s, output: 3111.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:52<00:05, 102.04it/s, est. speed input: 5895.37 toks/s, output: 3219.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:52<00:06, 94.61it/s, est. speed input: 6008.75 toks/s, output: 3289.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:53<00:06, 93.35it/s, est. speed input: 6119.71 toks/s, output: 3388.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:53<00:05, 97.21it/s, est. speed input: 6237.62 toks/s, output: 3481.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:53<00:04, 128.98it/s, est. speed input: 6481.67 toks/s, output: 3660.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:53<00:05, 93.70it/s, est. speed input: 6578.11 toks/s, output: 3728.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:53<00:05, 92.84it/s, est. speed input: 6685.88 toks/s, output: 3820.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 795/1280 [00:53<00:04, 99.64it/s, est. speed input: 6797.81 toks/s, output: 3926.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:54<00:04, 115.62it/s, est. speed input: 6963.77 toks/s, output: 4051.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▌ | 835/1280 [00:54<00:03, 128.81it/s, est. speed input: 7122.91 toks/s, output: 4187.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 855/1280 [00:54<00:03, 133.36it/s, est. speed input: 7275.26 toks/s, output: 4319.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:54<00:02, 137.28it/s, est. speed input: 7434.67 toks/s, output: 4433.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:54<00:03, 116.74it/s, est. speed input: 7542.62 toks/s, output: 4508.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:54<00:02, 166.00it/s, est. speed input: 7829.08 toks/s, output: 4759.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 945/1280 [00:54<00:02, 154.27it/s, est. speed input: 7977.61 toks/s, output: 4881.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▌ | 965/1280 [00:55<00:02, 148.89it/s, est. speed input: 8128.28 toks/s, output: 5025.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:55<00:02, 127.66it/s, est. speed input: 8257.08 toks/s, output: 5126.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:55<00:02, 119.81it/s, est. speed input: 8363.82 toks/s, output: 5206.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:55<00:02, 111.49it/s, est. speed input: 8478.40 toks/s, output: 5326.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:55<00:02, 117.89it/s, est. speed input: 8591.39 toks/s, output: 5419.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:55<00:01, 153.58it/s, est. speed input: 8823.75 toks/s, output: 5635.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:56<00:01, 124.31it/s, est. speed input: 8947.44 toks/s, output: 5773.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:56<00:01, 151.29it/s, est. speed input: 9179.95 toks/s, output: 5964.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:56<00:01, 132.26it/s, est. speed input: 9304.42 toks/s, output: 6100.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1145/1280 [00:56<00:01, 131.84it/s, est. speed input: 9404.74 toks/s, output: 6210.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:56<00:01, 102.49it/s, est. speed input: 9482.16 toks/s, output: 6312.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:56<00:01, 91.80it/s, est. speed input: 9571.52 toks/s, output: 6403.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:57<00:01, 77.57it/s, est. speed input: 9645.40 toks/s, output: 6499.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:57<00:01, 78.15it/s, est. speed input: 9706.31 toks/s, output: 6554.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:57<00:00, 71.18it/s, est. speed input: 9754.35 toks/s, output: 6633.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:57<00:00, 71.82it/s, est. speed input: 9807.43 toks/s, output: 6692.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:58<00:01, 47.42it/s, est. speed input: 9816.93 toks/s, output: 6727.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:58<00:00, 36.28it/s, est. speed input: 9841.84 toks/s, output: 6810.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:59<00:01, 25.29it/s, est. speed input: 9787.91 toks/s, output: 6786.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [01:00<00:01, 12.52it/s, est. speed input: 9604.91 toks/s, output: 6692.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [01:01<00:01, 13.35it/s, est. speed input: 9583.18 toks/s, output: 6730.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [01:02<00:01, 8.53it/s, est. speed input: 9394.06 toks/s, output: 6640.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:12<00:02, 1.96it/s, est. speed input: 8157.96 toks/s, output: 5811.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:24<00:02, 1.96it/s, est. speed input: 8157.96 toks/s, output: 5811.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:30<00:00, 1.24s/it, est. speed input: 6551.57 toks/s, output: 4720.52 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:30<00:00, 14.12it/s, est. speed input: 6551.57 toks/s, output: 4720.52 toks/s]
[36m(Runner pid=3309020)[0m mean: 315.236
[36m(Runner pid=3309020)[0m min: 60.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.346
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.672
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.131
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.045
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.282
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.233
[36m(Runner pid=3309020)[0m gen: 105.901
[36m(Runner pid=3309020)[0m old: 88.153
[36m(Runner pid=3309020)[0m ref: 89.171
[36m(Runner pid=3309020)[0m reward: 6.13
[36m(Runner pid=3309020)[0m save_checkpoint: 32.685
[36m(Runner pid=3309020)[0m step: 1059.018
[36m(Runner pid=3309020)[0m update_actor: 565.324
[36m(Runner pid=3309020)[0m validation: 170.773
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.423
[36m(Runner pid=3309020)[0m format_reward: 0.975
[36m(Runner pid=3309020)[0m overall_reward: 0.7
[36m(Runner pid=3309020)[0m reward_score: 0.7
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.981
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 66; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_65/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:49:55 [executor_base.py:219] It took 0.340455 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:51:36 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:49:55 [executor_base.py:219] It took 0.340127 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:51:36 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 13:51:36 [executor_base.py:208] It took 0.327596 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:52:21 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:52:21 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 13:52:21 [executor_base.py:208] It took 0.326666 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.82 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.46506813168525696, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.4020022749900818, 'actor/pg_clipfrac': 0.00100351229775697, 'actor/ppo_kl': -0.0011290834518149495}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.4417308568954468, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.42774686217308044, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.10723711550235748, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.13163742423057556, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001081790032912977}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.43017151951789856, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001132558609242551}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00022002958576194942, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5155621767044067, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00077545311069116}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.09356957674026489, 'actor/pg_clipfrac': 0.0005602241144515574, 'actor/ppo_kl': 0.0002122114965459332}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.8424280285835266, 'actor/pg_clipfrac': 0.002518891589716077, 'actor/ppo_kl': 0.0002741169009823352}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.07966397702693939, 'actor/pg_clipfrac': 0.0005491488263942301, 'actor/ppo_kl': -0.0003133257559966296}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.046479731798172, 'actor/pg_clipfrac': 0.0016420361353084445, 'actor/ppo_kl': 0.0017321388004347682}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.3945235311985016, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.23432806134223938, 'actor/pg_clipfrac': 0.001171646174043417, 'actor/ppo_kl': -0.0007133506005629897}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.6887407898902893, 'actor/pg_clipfrac': 0.00047438329784199595, 'actor/ppo_kl': 0.00022482102212961763}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.44749486446380615, 'actor/pg_clipfrac': 0.001816530479118228, 'actor/ppo_kl': 0.0009990556864067912}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003005665785167366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007233293144963682}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0003528303059283644, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009867873741313815}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.17785315215587616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007740783039480448}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.19887837767601013, 'actor/pg_clipfrac': 0.0006071645184420049, 'actor/ppo_kl': 0.00051958515541628}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2295018583536148, 'actor/pg_clipfrac': 0.001435750164091587, 'actor/ppo_kl': 0.0005257668672129512}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.000309774826746434, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00011774549057008699}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00033037696266546845, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 3.939594535040669e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.12954942882061005, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006188447005115449}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.48734956979751587, 'actor/pg_clipfrac': 0.0007446016534231603, 'actor/ppo_kl': 0.001129291020333767}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003030127554666251, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001504201558418572}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.8137467503547668, 'actor/pg_clipfrac': 0.002851711120456457, 'actor/ppo_kl': -0.00041845813393592834}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.2794446051120758, 'actor/pg_clipfrac': 0.0010608204174786806, 'actor/ppo_kl': 0.0004439832700882107}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.18094879388809204, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00041384954238310456}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00025738580734468997, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016888165846467018}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2690361738204956, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00017037306679412723}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.07237433642148972, 'actor/pg_clipfrac': 0.003762227250263095, 'actor/ppo_kl': 0.0015504476614296436}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.5506099462509155, 'actor/pg_clipfrac': 0.0031826861668378115, 'actor/ppo_kl': -0.0003513527917675674}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.0002391486632404849, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003111132245976478}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.22833658754825592, 'actor/pg_clipfrac': 0.003392130369320512, 'actor/ppo_kl': 0.0012767460430040956}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00018997078586835414, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008331950521096587}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.10415700823068619, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005770207499153912}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4477350413799286, 'actor/pg_clipfrac': 0.0010427528759464622, 'actor/ppo_kl': 0.00031651047174818814}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0845012441277504, 'actor/pg_clipfrac': 0.00048756704200059175, 'actor/ppo_kl': -0.00034589131246320903}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.3925035297870636, 'actor/pg_clipfrac': 0.0007698229164816439, 'actor/ppo_kl': 0.0007898845942690969}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.20551817119121552, 'actor/pg_clipfrac': 0.0009578543831594288, 'actor/ppo_kl': 0.0017942030681297183}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00036267060204409063, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006202479708008468}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2259606570005417, 'actor/pg_clipfrac': 0.0006891798693686724, 'actor/ppo_kl': -0.001738441176712513}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.030325589701533318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00034808009513653815}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.16941912472248077, 'actor/pg_clipfrac': 0.001100110006518662, 'actor/ppo_kl': -0.0008353620651178062}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.03855365514755249, 'actor/pg_clipfrac': 0.0028680688701570034, 'actor/ppo_kl': 0.000968504580669105}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0002700841869227588, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006016303086653352}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00019510621496010572, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 7.460725464625284e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.06855040788650513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011679631425067782}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.17518621683120728, 'actor/pg_clipfrac': 0.0009208103292621672, 'actor/ppo_kl': -0.000837247003801167}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0004013779398519546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000882122665643692}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002176289417548105, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005064677679911256}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5001179575920105, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000495283049531281}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0001911862491397187, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001225095591507852}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.21491660177707672, 'actor/pg_clipfrac': 0.0021707669366151094, 'actor/ppo_kl': -0.0012341276742517948}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.7973812222480774, 'actor/pg_clipfrac': 0.0007955449400469661, 'actor/ppo_kl': 0.00038589289761148393}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.10698163509368896, 'actor/pg_clipfrac': 0.0007980845985002816, 'actor/ppo_kl': 0.0003400058194529265}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.08734966069459915, 'actor/pg_clipfrac': 0.0012322858674451709, 'actor/ppo_kl': 0.0009857913246378303}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0004211754712741822, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003597462782636285}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.3617516756057739, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000565818278118968}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.32219529151916504, 'actor/pg_clipfrac': 0.0011135857785120606, 'actor/ppo_kl': 0.000345358595950529}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.35270020365715027, 'actor/pg_clipfrac': 0.0006123698549345136, 'actor/ppo_kl': 0.0005646686186082661}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.18158861994743347, 'actor/pg_clipfrac': 0.0026200872380286455, 'actor/ppo_kl': -0.0010219058021903038}
[36m(Runner pid=3309020)[0m Step 66
[36m(Runner pid=3309020)[0m actor:
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:47:15, 5.05s/it, est. speed input: 88.16 toks/s, output: 24.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<46:52, 2.21s/it, est. speed input: 171.81 toks/s, output: 48.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:28<28:43, 1.36s/it, est. speed input: 239.71 toks/s, output: 66.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:30<20:52, 1.01it/s, est. speed input: 299.69 toks/s, output: 90.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<15:12, 1.38it/s, est. speed input: 358.04 toks/s, output: 108.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:34<11:01, 1.88it/s, est. speed input: 453.08 toks/s, output: 140.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:36<07:22, 2.79it/s, est. speed input: 568.56 toks/s, output: 180.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<05:51, 3.50it/s, est. speed input: 627.76 toks/s, output: 202.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<04:47, 4.26it/s, est. speed input: 684.04 toks/s, output: 223.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:37<03:18, 6.11it/s, est. speed input: 797.24 toks/s, output: 261.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:37<02:50, 7.11it/s, est. speed input: 850.25 toks/s, output: 278.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<02:17, 8.80it/s, est. speed input: 904.55 toks/s, output: 303.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:38<02:00, 9.97it/s, est. speed input: 959.65 toks/s, output: 322.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:38<01:42, 11.67it/s, est. speed input: 1015.53 toks/s, output: 343.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:39<01:36, 12.24it/s, est. speed input: 1108.35 toks/s, output: 387.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:39<01:06, 17.80it/s, est. speed input: 1218.58 toks/s, output: 430.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:39<01:10, 16.62it/s, est. speed input: 1268.92 toks/s, output: 453.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:39<01:01, 18.91it/s, est. speed input: 1316.96 toks/s, output: 477.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:40<00:58, 19.85it/s, est. speed input: 1361.57 toks/s, output: 500.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:40<00:47, 24.24it/s, est. speed input: 1465.28 toks/s, output: 544.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:40<00:51, 22.30it/s, est. speed input: 1511.62 toks/s, output: 560.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:41<01:07, 16.72it/s, est. speed input: 1594.49 toks/s, output: 601.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:42<00:53, 21.00it/s, est. speed input: 1782.12 toks/s, output: 694.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:42<00:57, 19.18it/s, est. speed input: 1860.14 toks/s, output: 726.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:43<00:47, 23.13it/s, est. speed input: 2001.46 toks/s, output: 789.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:43<00:43, 24.82it/s, est. speed input: 2049.14 toks/s, output: 816.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:43<00:42, 25.33it/s, est. speed input: 2093.43 toks/s, output: 830.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:43<00:32, 33.10it/s, est. speed input: 2190.76 toks/s, output: 886.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:43<00:38, 27.86it/s, est. speed input: 2229.58 toks/s, output: 907.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:44<00:32, 32.47it/s, est. speed input: 2413.53 toks/s, output: 1002.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:44<00:34, 30.24it/s, est. speed input: 2454.10 toks/s, output: 1023.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:44<00:32, 32.12it/s, est. speed input: 2494.53 toks/s, output: 1050.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:45<00:26, 38.47it/s, est. speed input: 2589.97 toks/s, output: 1100.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:45<00:27, 36.90it/s, est. speed input: 2631.50 toks/s, output: 1118.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:45<00:25, 38.73it/s, est. speed input: 2765.05 toks/s, output: 1193.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:45<00:20, 47.41it/s, est. speed input: 2855.47 toks/s, output: 1242.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:46<00:31, 31.69it/s, est. speed input: 2926.39 toks/s, output: 1291.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:46<00:16, 57.98it/s, est. speed input: 3157.57 toks/s, output: 1424.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:46<00:17, 55.46it/s, est. speed input: 3285.13 toks/s, output: 1506.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:46<00:15, 60.74it/s, est. speed input: 3425.45 toks/s, output: 1587.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:46<00:15, 59.42it/s, est. speed input: 3510.84 toks/s, output: 1627.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:47<00:14, 62.70it/s, est. speed input: 3600.05 toks/s, output: 1673.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:47<00:12, 70.53it/s, est. speed input: 3822.37 toks/s, output: 1800.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:47<00:14, 60.20it/s, est. speed input: 3900.96 toks/s, output: 1844.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:47<00:15, 56.74it/s, est. speed input: 3981.93 toks/s, output: 1895.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:47<00:12, 68.44it/s, est. speed input: 4119.06 toks/s, output: 1969.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:48<00:12, 66.32it/s, est. speed input: 4199.16 toks/s, output: 2017.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:48<00:11, 69.56it/s, est. speed input: 4280.20 toks/s, output: 2077.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:48<00:10, 76.18it/s, est. speed input: 4407.44 toks/s, output: 2147.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:48<00:07, 107.22it/s, est. speed input: 4633.22 toks/s, output: 2289.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:48<00:09, 81.93it/s, est. speed input: 4795.49 toks/s, output: 2379.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:49<00:09, 77.44it/s, est. speed input: 4876.23 toks/s, output: 2422.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:49<00:09, 80.26it/s, est. speed input: 4959.81 toks/s, output: 2473.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:49<00:09, 77.94it/s, est. speed input: 5044.05 toks/s, output: 2513.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:49<00:13, 54.75it/s, est. speed input: 5106.36 toks/s, output: 2561.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:49<00:14, 51.09it/s, est. speed input: 5177.94 toks/s, output: 2616.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 585/1280 [00:50<00:09, 72.72it/s, est. speed input: 5381.00 toks/s, output: 2759.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:50<00:08, 82.17it/s, est. speed input: 5509.72 toks/s, output: 2834.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:50<00:08, 77.63it/s, est. speed input: 5584.27 toks/s, output: 2896.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:50<00:05, 116.50it/s, est. speed input: 5842.21 toks/s, output: 3055.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:50<00:05, 106.13it/s, est. speed input: 5961.95 toks/s, output: 3148.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:51<00:07, 77.18it/s, est. speed input: 6057.25 toks/s, output: 3239.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:51<00:08, 74.92it/s, est. speed input: 6131.90 toks/s, output: 3299.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:51<00:08, 73.23it/s, est. speed input: 6208.75 toks/s, output: 3352.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:51<00:06, 84.50it/s, est. speed input: 6330.76 toks/s, output: 3449.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:51<00:06, 89.99it/s, est. speed input: 6449.77 toks/s, output: 3544.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:51<00:05, 94.59it/s, est. speed input: 6569.92 toks/s, output: 3634.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:51<00:06, 83.71it/s, est. speed input: 6681.19 toks/s, output: 3720.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:52<00:05, 94.60it/s, est. speed input: 6802.60 toks/s, output: 3801.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████▏ | 785/1280 [00:52<00:04, 110.27it/s, est. speed input: 6961.91 toks/s, output: 3893.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:52<00:04, 112.01it/s, est. speed input: 7078.73 toks/s, output: 3996.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:52<00:04, 114.32it/s, est. speed input: 7191.33 toks/s, output: 4083.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:52<00:03, 122.36it/s, est. speed input: 7312.01 toks/s, output: 4188.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:52<00:04, 99.79it/s, est. speed input: 7416.17 toks/s, output: 4275.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:52<00:04, 102.10it/s, est. speed input: 7526.42 toks/s, output: 4357.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:52<00:03, 123.81it/s, est. speed input: 7682.47 toks/s, output: 4499.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:53<00:03, 105.41it/s, est. speed input: 7785.53 toks/s, output: 4584.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:53<00:03, 112.67it/s, est. speed input: 7938.74 toks/s, output: 4726.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:53<00:03, 105.02it/s, est. speed input: 8049.74 toks/s, output: 4804.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:53<00:03, 95.86it/s, est. speed input: 8252.83 toks/s, output: 4964.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:53<00:03, 91.92it/s, est. speed input: 8356.53 toks/s, output: 5065.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:54<00:03, 84.73it/s, est. speed input: 8418.72 toks/s, output: 5126.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 995/1280 [00:54<00:03, 88.08it/s, est. speed input: 8529.46 toks/s, output: 5206.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:54<00:02, 95.74it/s, est. speed input: 8641.49 toks/s, output: 5317.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 1020/1280 [00:54<00:02, 95.12it/s, est. speed input: 8702.76 toks/s, output: 5365.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████▏ | 1040/1280 [00:54<00:02, 116.78it/s, est. speed input: 8854.60 toks/s, output: 5511.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:54<00:01, 119.62it/s, est. speed input: 8964.91 toks/s, output: 5621.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:54<00:02, 96.72it/s, est. speed input: 9055.07 toks/s, output: 5701.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:55<00:01, 96.68it/s, est. speed input: 9221.47 toks/s, output: 5877.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:55<00:01, 103.33it/s, est. speed input: 9321.41 toks/s, output: 5973.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:55<00:01, 105.22it/s, est. speed input: 9424.33 toks/s, output: 6066.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:55<00:01, 106.68it/s, est. speed input: 9525.04 toks/s, output: 6188.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:55<00:01, 110.48it/s, est. speed input: 9656.43 toks/s, output: 6357.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:55<00:00, 106.03it/s, est. speed input: 9755.73 toks/s, output: 6453.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1195/1280 [00:56<00:00, 95.40it/s, est. speed input: 9873.75 toks/s, output: 6586.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:56<00:00, 97.18it/s, est. speed input: 9974.79 toks/s, output: 6684.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:56<00:00, 64.42it/s, est. speed input: 10021.07 toks/s, output: 6776.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:56<00:00, 68.53it/s, est. speed input: 10087.00 toks/s, output: 6852.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:57<00:00, 78.11it/s, est. speed input: 10227.54 toks/s, output: 7038.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:57<00:00, 50.59it/s, est. speed input: 10251.31 toks/s, output: 7101.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 7.83it/s, est. speed input: 9468.57 toks/s, output: 6605.77 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 20.35it/s, est. speed input: 9468.57 toks/s, output: 6605.77 toks/s]
[36m(Runner pid=3309020)[0m grad_norm: 0.256
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.021
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 1.714882297214615e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.024
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.67
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.67
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1019354
[36m(Runner pid=3309020)[0m balanced_min: 1018375
[36m(Runner pid=3309020)[0m max: 1021523
[36m(Runner pid=3309020)[0m mean: 1018864.5
[36m(Runner pid=3309020)[0m min: 1016206
[36m(Runner pid=3309020)[0m minmax_diff: 5317
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.221
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.137
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.125
[36m(Runner pid=3309020)[0m throughput: 1119.016
[36m(Runner pid=3309020)[0m time_per_step: 910.5
[36m(Runner pid=3309020)[0m total_num_tokens: 2037729
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 629.0
[36m(Runner pid=3309020)[0m mean: 464.943
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 5128.0
[36m(Runner pid=3309020)[0m mean: 331.045
[36m(Runner pid=3309020)[0m min: 54.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.343
[36m(Runner pid=3309020)[0m format: 0.996
[36m(Runner pid=3309020)[0m overall: 0.67
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.322846215539559e-05
[36m(Runner pid=3309020)[0m gen: 0.188
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.277
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.19
[36m(Runner pid=3309020)[0m gen: 159.599
[36m(Runner pid=3309020)[0m old: 88.456
[36m(Runner pid=3309020)[0m ref: 89.119
[36m(Runner pid=3309020)[0m reward: 7.172
[36m(Runner pid=3309020)[0m step: 910.5
[36m(Runner pid=3309020)[0m update_actor: 565.285
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 67; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.71 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:05:06 [executor_base.py:219] It took 0.338623 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.74 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:06:41 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:05:06 [executor_base.py:219] It took 0.341382 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:06:41 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:06:41 [executor_base.py:208] It took 0.330457 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:06:58 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:06:58 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:06:58 [executor_base.py:208] It took 0.327744 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002121783618349582, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000424884376116097}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.7539800405502319, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.13415269553661346, 'actor/pg_clipfrac': 0.000908265239559114, 'actor/ppo_kl': 0.0006687743007205427}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0004312628007028252, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001997640123590827}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.14371757209300995, 'actor/pg_clipfrac': 0.0013504389207810163, 'actor/ppo_kl': -0.0007309124921448529}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.26543211936950684, 'actor/pg_clipfrac': 0.0013831258984282613, 'actor/ppo_kl': 0.0013537162449210882}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.24007152020931244, 'actor/pg_clipfrac': 0.0014204545877873898, 'actor/ppo_kl': 0.0011502341367304325}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.41633057594299316, 'actor/pg_clipfrac': 0.0006249999860301614, 'actor/ppo_kl': 0.0009552121045999229}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.5320437550544739, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.3612577021121979, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0002909272734541446, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.13760504126548767, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003703403053805232, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.1930961012840271, 'actor/pg_clipfrac': 0.0008285004296340048, 'actor/ppo_kl': -8.692737173987553e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.1250493824481964, 'actor/pg_clipfrac': 0.0009293680195696652, 'actor/ppo_kl': -0.0015477354172617197}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.39507612586021423, 'actor/pg_clipfrac': 0.0009737098589539528, 'actor/ppo_kl': 0.0006858190754428506}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.5237647891044617, 'actor/pg_clipfrac': 0.0016849199309945107, 'actor/ppo_kl': 0.00013602753460872918}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.12154846638441086, 'actor/pg_clipfrac': 0.0006747638108208776, 'actor/ppo_kl': -0.00022720872948411852}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.11710021644830704, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005481690750457346}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.24487926065921783, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006677435012534261}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.31230488419532776, 'actor/pg_clipfrac': 0.0010964912362396717, 'actor/ppo_kl': -0.0018914728425443172}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2681615948677063, 'actor/pg_clipfrac': 0.0005927682504989207, 'actor/ppo_kl': -0.0007703800220042467}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002925280132330954, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00011514734069351107}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2806847095489502, 'actor/pg_clipfrac': 0.00161485665012151, 'actor/ppo_kl': 0.00025915453443303704}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.11930321902036667, 'actor/pg_clipfrac': 0.0008176614646799862, 'actor/ppo_kl': -0.0007473937002941966}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.11807034909725189, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008425437845289707}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0002689420944079757, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005313348374329507}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.043111152946949005, 'actor/pg_clipfrac': 0.0004739336436614394, 'actor/ppo_kl': -0.00130754173733294}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.7450327277183533, 'actor/pg_clipfrac': 0.0009259259095415473, 'actor/ppo_kl': 0.0006350676412694156}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.6020627617835999, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000584278313908726}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.10640417784452438, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008895549108274281}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003465283662080765, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014234819682314992}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.1719110608100891, 'actor/pg_clipfrac': 0.0005307855899445713, 'actor/ppo_kl': -0.0003595210437197238}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.000380173500161618, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012667946284636855}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.15192911028862, 'actor/pg_clipfrac': 0.0007830853573977947, 'actor/ppo_kl': -0.000442745367763564}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.10310633480548859, 'actor/pg_clipfrac': 0.0009041591547429562, 'actor/ppo_kl': -0.00014899221423547715}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.11714193969964981, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011562915751710534}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.17925401031970978, 'actor/pg_clipfrac': 0.0013054830487817526, 'actor/ppo_kl': 0.0007826653309166431}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.16146191954612732, 'actor/pg_clipfrac': 0.0018137847073376179, 'actor/ppo_kl': 0.0008236857247538865}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.004106540232896805, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001147654140368104}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0003205526154488325, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008636880666017532}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.18437781929969788, 'actor/pg_clipfrac': 0.0006430867942981422, 'actor/ppo_kl': 0.0007168282172642648}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.0022816769778728485, 'actor/pg_clipfrac': 0.0009496675920672715, 'actor/ppo_kl': -0.0003158482431899756}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.4554433226585388, 'actor/pg_clipfrac': 0.0013628620654344559, 'actor/ppo_kl': 0.0012131627881899476}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0005212855758145452, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019988704298157245}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0005233713309280574, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007521783118136227}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.000209626552532427, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -1.2682088708970696e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.15044932067394257, 'actor/pg_clipfrac': 0.0012476606061682105, 'actor/ppo_kl': -0.0022557354532182217}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.26745712757110596, 'actor/pg_clipfrac': 0.002030456904321909, 'actor/ppo_kl': -0.00027132953982800245}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0003676674677990377, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013188301818445325}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.12912555038928986, 'actor/pg_clipfrac': 0.004415011033415794, 'actor/ppo_kl': -0.0013444197829812765}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.08686058223247528, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -5.0545757403597236e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.05462545156478882, 'actor/pg_clipfrac': 0.0007961783558130264, 'actor/ppo_kl': -0.0010064863599836826}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.07882149517536163, 'actor/pg_clipfrac': 0.0030075188260525465, 'actor/ppo_kl': 2.995541217387654e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.000347849796526134, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009187015239149332}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.34364399313926697, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0015589639078825712}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00015169517428148538, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013548204442486167}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:37:08, 4.57s/it, est. speed input: 102.16 toks/s, output: 26.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:23<40:14, 1.90s/it, est. speed input: 203.76 toks/s, output: 52.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<27:54, 1.32s/it, est. speed input: 266.77 toks/s, output: 71.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:28<20:47, 1.01it/s, est. speed input: 325.38 toks/s, output: 90.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<18:01, 1.16it/s, est. speed input: 365.78 toks/s, output: 104.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:33<13:38, 1.53it/s, est. speed input: 417.08 toks/s, output: 125.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:34<08:10, 2.53it/s, est. speed input: 534.46 toks/s, output: 164.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<07:08, 2.88it/s, est. speed input: 582.78 toks/s, output: 180.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<06:22, 3.22it/s, est. speed input: 627.76 toks/s, output: 195.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:36<03:42, 5.49it/s, est. speed input: 753.86 toks/s, output: 235.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:26, 8.24it/s, est. speed input: 872.98 toks/s, output: 278.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:36<02:02, 9.82it/s, est. speed input: 931.23 toks/s, output: 302.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<01:51, 10.80it/s, est. speed input: 985.39 toks/s, output: 321.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:32, 12.99it/s, est. speed input: 1041.31 toks/s, output: 342.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<01:08, 17.38it/s, est. speed input: 1153.97 toks/s, output: 386.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:37<00:52, 22.47it/s, est. speed input: 1269.24 toks/s, output: 430.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:38<00:42, 27.14it/s, est. speed input: 1383.47 toks/s, output: 473.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:44, 26.00it/s, est. speed input: 1432.50 toks/s, output: 495.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:38<00:40, 28.49it/s, est. speed input: 1543.08 toks/s, output: 541.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:39<01:01, 18.47it/s, est. speed input: 1576.88 toks/s, output: 558.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<00:50, 22.50it/s, est. speed input: 1675.88 toks/s, output: 606.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:39<00:38, 29.51it/s, est. speed input: 1800.99 toks/s, output: 653.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:39<00:30, 36.71it/s, est. speed input: 1913.14 toks/s, output: 700.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:40<00:32, 33.99it/s, est. speed input: 2007.76 toks/s, output: 740.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:31, 34.45it/s, est. speed input: 2058.16 toks/s, output: 760.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:40<00:31, 34.98it/s, est. speed input: 2111.15 toks/s, output: 786.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:27, 39.38it/s, est. speed input: 2217.37 toks/s, output: 830.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:40<00:19, 55.51it/s, est. speed input: 2382.95 toks/s, output: 907.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:23, 44.50it/s, est. speed input: 2475.56 toks/s, output: 955.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:28, 37.45it/s, est. speed input: 2563.22 toks/s, output: 1002.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:45, 23.21it/s, est. speed input: 2586.19 toks/s, output: 1012.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:42<00:37, 27.36it/s, est. speed input: 2681.52 toks/s, output: 1066.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:39, 25.96it/s, est. speed input: 2722.65 toks/s, output: 1089.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:43<00:51, 19.92it/s, est. speed input: 2745.27 toks/s, output: 1101.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:43<00:35, 28.20it/s, est. speed input: 2850.29 toks/s, output: 1155.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:43<00:33, 30.26it/s, est. speed input: 2936.26 toks/s, output: 1205.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:43<00:33, 30.15it/s, est. speed input: 2979.69 toks/s, output: 1213.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:30, 32.46it/s, est. speed input: 3024.66 toks/s, output: 1237.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:44<00:30, 32.27it/s, est. speed input: 3114.24 toks/s, output: 1287.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:44<00:49, 19.68it/s, est. speed input: 3163.46 toks/s, output: 1313.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:45<00:43, 22.44it/s, est. speed input: 3211.59 toks/s, output: 1338.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:45<00:44, 21.90it/s, est. speed input: 3246.14 toks/s, output: 1364.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:45<00:42, 22.66it/s, est. speed input: 3283.00 toks/s, output: 1386.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:45<00:25, 37.69it/s, est. speed input: 3424.85 toks/s, output: 1463.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 345/1280 [00:45<00:21, 44.31it/s, est. speed input: 3518.25 toks/s, output: 1516.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:46<00:13, 68.38it/s, est. speed input: 3801.84 toks/s, output: 1673.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:46<00:12, 68.70it/s, est. speed input: 3932.15 toks/s, output: 1745.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:46<00:16, 53.14it/s, est. speed input: 4005.14 toks/s, output: 1792.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:46<00:13, 64.39it/s, est. speed input: 4136.48 toks/s, output: 1864.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:46<00:11, 75.29it/s, est. speed input: 4274.05 toks/s, output: 1955.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:10, 76.49it/s, est. speed input: 4359.96 toks/s, output: 2004.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:47<00:15, 52.97it/s, est. speed input: 4425.12 toks/s, output: 2053.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:47<00:17, 47.19it/s, est. speed input: 4497.50 toks/s, output: 2087.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 470/1280 [00:47<00:15, 50.66it/s, est. speed input: 4582.51 toks/s, output: 2131.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:47<00:10, 72.79it/s, est. speed input: 4759.32 toks/s, output: 2250.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:48<00:10, 75.27it/s, est. speed input: 4846.81 toks/s, output: 2312.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:48<00:14, 54.92it/s, est. speed input: 4909.58 toks/s, output: 2350.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:48<00:15, 48.30it/s, est. speed input: 4979.50 toks/s, output: 2390.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:48<00:12, 60.84it/s, est. speed input: 5102.90 toks/s, output: 2452.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:48<00:12, 57.89it/s, est. speed input: 5175.00 toks/s, output: 2512.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:49<00:10, 70.32it/s, est. speed input: 5300.93 toks/s, output: 2578.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:49<00:09, 72.23it/s, est. speed input: 5380.47 toks/s, output: 2641.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:49<00:10, 69.96it/s, est. speed input: 5454.88 toks/s, output: 2695.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:49<00:09, 70.78it/s, est. speed input: 5608.32 toks/s, output: 2785.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:49<00:06, 99.15it/s, est. speed input: 5826.56 toks/s, output: 2937.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:50<00:07, 86.39it/s, est. speed input: 5938.43 toks/s, output: 3028.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:50<00:06, 93.86it/s, est. speed input: 6153.32 toks/s, output: 3170.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:50<00:06, 90.50it/s, est. speed input: 6271.37 toks/s, output: 3253.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:50<00:06, 88.44it/s, est. speed input: 6347.81 toks/s, output: 3312.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:50<00:05, 95.89it/s, est. speed input: 6479.33 toks/s, output: 3398.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:50<00:05, 98.96it/s, est. speed input: 6641.56 toks/s, output: 3526.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:51<00:06, 86.87it/s, est. speed input: 6753.30 toks/s, output: 3617.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:51<00:06, 84.26it/s, est. speed input: 6826.28 toks/s, output: 3670.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|██████ | 770/1280 [00:51<00:05, 94.51it/s, est. speed input: 6980.98 toks/s, output: 3780.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:51<00:05, 94.31it/s, est. speed input: 7050.58 toks/s, output: 3827.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:51<00:05, 94.36it/s, est. speed input: 7123.82 toks/s, output: 3877.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:51<00:05, 95.40it/s, est. speed input: 7202.61 toks/s, output: 3957.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:51<00:04, 103.95it/s, est. speed input: 7323.62 toks/s, output: 4049.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:52<00:04, 92.54it/s, est. speed input: 7437.34 toks/s, output: 4151.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:52<00:04, 95.72it/s, est. speed input: 7586.06 toks/s, output: 4273.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:52<00:04, 103.26it/s, est. speed input: 7706.68 toks/s, output: 4362.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:52<00:03, 110.19it/s, est. speed input: 7821.63 toks/s, output: 4442.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:52<00:03, 116.20it/s, est. speed input: 7942.08 toks/s, output: 4535.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:52<00:04, 76.00it/s, est. speed input: 8019.99 toks/s, output: 4619.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:53<00:04, 83.35it/s, est. speed input: 8135.72 toks/s, output: 4715.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:53<00:03, 86.39it/s, est. speed input: 8201.07 toks/s, output: 4784.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:53<00:03, 104.65it/s, est. speed input: 8355.37 toks/s, output: 4921.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:53<00:02, 145.05it/s, est. speed input: 8603.34 toks/s, output: 5125.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▊ | 1005/1280 [00:53<00:01, 145.51it/s, est. speed input: 8761.21 toks/s, output: 5271.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:53<00:01, 146.59it/s, est. speed input: 8914.86 toks/s, output: 5384.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:53<00:01, 121.14it/s, est. speed input: 9040.45 toks/s, output: 5525.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:54<00:02, 74.69it/s, est. speed input: 9096.93 toks/s, output: 5596.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:54<00:02, 85.38it/s, est. speed input: 9215.85 toks/s, output: 5699.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:54<00:01, 112.48it/s, est. speed input: 9441.32 toks/s, output: 5922.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:54<00:01, 115.67it/s, est. speed input: 9550.08 toks/s, output: 6041.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:54<00:01, 118.48it/s, est. speed input: 9667.43 toks/s, output: 6166.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:55<00:01, 103.91it/s, est. speed input: 9795.43 toks/s, output: 6263.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:55<00:01, 107.13it/s, est. speed input: 9899.90 toks/s, output: 6393.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:55<00:00, 126.58it/s, est. speed input: 10089.64 toks/s, output: 6604.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:55<00:00, 99.71it/s, est. speed input: 10172.16 toks/s, output: 6708.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:55<00:00, 80.03it/s, est. speed input: 10273.23 toks/s, output: 6823.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:56<00:00, 49.91it/s, est. speed input: 10265.89 toks/s, output: 6869.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:56<00:00, 52.64it/s, est. speed input: 10315.07 toks/s, output: 6937.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:57<00:00, 35.77it/s, est. speed input: 10286.79 toks/s, output: 6954.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:57<00:00, 24.07it/s, est. speed input: 10218.03 toks/s, output: 6936.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:58<00:00, 25.81it/s, est. speed input: 10239.31 toks/s, output: 6972.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:59<00:00, 12.40it/s, est. speed input: 10037.67 toks/s, output: 6850.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:05<00:00, 3.46it/s, est. speed input: 9216.65 toks/s, output: 6313.70 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:05<00:00, 19.69it/s, est. speed input: 9216.65 toks/s, output: 6313.70 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00026891898596659303, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00027108192443847656}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.31391698122024536, 'actor/pg_clipfrac': 0.001124859438277781, 'actor/ppo_kl': -0.00020538433454930782}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.14492422342300415, 'actor/pg_clipfrac': 0.0008984726155176759, 'actor/ppo_kl': -0.0003525716019794345}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.20795351266860962, 'actor/pg_clipfrac': 0.0017482517287135124, 'actor/ppo_kl': -0.00019633852934930474}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.16792313754558563, 'actor/pg_clipfrac': 0.0011135857785120606, 'actor/ppo_kl': 0.0006518565933220088}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0003229052817914635, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.494465949595906e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.29536327719688416, 'actor/pg_clipfrac': 0.0007246377062983811, 'actor/ppo_kl': 0.000431301275966689}
[36m(Runner pid=3309020)[0m Step 67
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.25
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.057
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.012
[36m(Runner pid=3309020)[0m ppo_kl: 0.0
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.021
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.685
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.685
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1014659
[36m(Runner pid=3309020)[0m balanced_min: 1014658
[36m(Runner pid=3309020)[0m max: 1018331
[36m(Runner pid=3309020)[0m mean: 1014658.5
[36m(Runner pid=3309020)[0m min: 1010986
[36m(Runner pid=3309020)[0m minmax_diff: 7345
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.044
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.124
[36m(Runner pid=3309020)[0m throughput: 1151.23
[36m(Runner pid=3309020)[0m time_per_step: 881.369
[36m(Runner pid=3309020)[0m total_num_tokens: 2029317
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1016.0
[36m(Runner pid=3309020)[0m mean: 466.113
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3963.0
[36m(Runner pid=3309020)[0m mean: 326.589
[36m(Runner pid=3309020)[0m min: 76.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.371
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.685
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 9.807705994978447e-05
[36m(Runner pid=3309020)[0m gen: 0.152
[36m(Runner pid=3309020)[0m old: 0.045
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.007
[36m(Runner pid=3309020)[0m update_actor: 0.279
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.199
[36m(Runner pid=3309020)[0m gen: 126.965
[36m(Runner pid=3309020)[0m old: 90.561
[36m(Runner pid=3309020)[0m ref: 89.816
[36m(Runner pid=3309020)[0m reward: 6.104
[36m(Runner pid=3309020)[0m step: 881.369
[36m(Runner pid=3309020)[0m update_actor: 567.088
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 68; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.05 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:19:49 [executor_base.py:219] It took 0.339167 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.97 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:21:21 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:19:49 [executor_base.py:219] It took 0.340139 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:21:22 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:21:22 [executor_base.py:208] It took 0.327894 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.77 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.86 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:21:25 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:21:25 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.86 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:21:25 [executor_base.py:208] It took 0.326262 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.019810473546385765, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.14924442768096924, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.14623619616031647, 'actor/pg_clipfrac': 0.0025167784187942743, 'actor/ppo_kl': 0.0002889441093429923}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0004421213234309107, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0004324210749473423, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014986266614869237}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0418383814394474, 'actor/pg_clipfrac': 0.001226993859745562, 'actor/ppo_kl': 0.0006958710146136582}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.15471050143241882, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00028654365451075137}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1016089916229248, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00043990107951685786, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.00018032376829069108, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00012476214033085853}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.16899438202381134, 'actor/pg_clipfrac': 0.0007342143799178302, 'actor/ppo_kl': 0.001277496456168592}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0007050868007354438, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.22739240527153015, 'actor/pg_clipfrac': 0.005226480774581432, 'actor/ppo_kl': -0.0011639395961537957}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0002189748192904517, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008815891342237592}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0004018026520498097, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.29234737157821655, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.0002035896759480238, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002514541847631335}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.2590966820716858, 'actor/pg_clipfrac': 0.0015128592494875193, 'actor/ppo_kl': 0.0024858638644218445}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.21052002906799316, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011734096333384514}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.20345833897590637, 'actor/pg_clipfrac': 0.001060445443727076, 'actor/ppo_kl': -0.0012132901465520263}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0004191083135083318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00048463355051353574}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0005720945191569626, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004845206276513636}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.13036781549453735, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001695540704531595}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002553320082370192, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0007698818226344883}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.24384893476963043, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00037537000025622547}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0002570462238509208, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.30276884092018e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.6788343787193298, 'actor/pg_clipfrac': 0.0021082221064716578, 'actor/ppo_kl': -0.00014397600898519158}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00023171391512732953, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0023982995189726353}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00027331963065080345, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001078471657820046}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.06480304151773453, 'actor/pg_clipfrac': 0.0011428571306169033, 'actor/ppo_kl': -0.0005512313800863922}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.34362632036209106, 'actor/pg_clipfrac': 0.0008904719725251198, 'actor/ppo_kl': 0.0001753164251567796}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2367798537015915, 'actor/pg_clipfrac': 0.001183431944809854, 'actor/ppo_kl': -0.0010939694475382566}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.3109810948371887, 'actor/pg_clipfrac': 0.0015037594130262733, 'actor/ppo_kl': -0.001855615060776472}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.054605089128017426, 'actor/pg_clipfrac': 0.0021810249891132116, 'actor/ppo_kl': 0.001727109425701201}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.08183471113443375, 'actor/pg_clipfrac': 0.0007570022717118263, 'actor/ppo_kl': 0.0015126386424526572}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.25594109296798706, 'actor/pg_clipfrac': 0.0015987210208550096, 'actor/ppo_kl': 0.000977809657342732}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00044979597441852093, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005191887030377984}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.27617934346199036, 'actor/pg_clipfrac': 0.004658385179936886, 'actor/ppo_kl': 0.00017800241766963154}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00031847404898144305, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008847058634273708}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002939790429081768, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00032646200270392}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.05105005204677582, 'actor/pg_clipfrac': 0.00070821528788656, 'actor/ppo_kl': 0.0015576676232740283}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.2664449214935303, 'actor/pg_clipfrac': 0.0023752970155328512, 'actor/ppo_kl': -0.00010190044122282416}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.45442280173301697, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005162114393897355}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.11236799508333206, 'actor/pg_clipfrac': 0.0030895983800292015, 'actor/ppo_kl': 0.0006810511695221066}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00027358831721358, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006435848190449178}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.009052183479070663, 'actor/pg_clipfrac': 0.003355704713612795, 'actor/ppo_kl': 0.0010334075195714831}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.04739602282643318, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000393887487007305}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.5433968305587769, 'actor/pg_clipfrac': 0.003218884114176035, 'actor/ppo_kl': -3.9170226955320686e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.35880401730537415, 'actor/pg_clipfrac': 0.001616814872249961, 'actor/ppo_kl': -0.0010087854461744428}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.32147863507270813, 'actor/pg_clipfrac': 0.0020283975172787905, 'actor/ppo_kl': 0.0001725995825836435}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.1352967619895935, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014923949493095279}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:50:27, 5.20s/it, est. speed input: 90.04 toks/s, output: 28.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:28<52:45, 2.49s/it, est. speed input: 166.80 toks/s, output: 53.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<29:20, 1.39s/it, est. speed input: 242.24 toks/s, output: 76.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:29<18:03, 1.16it/s, est. speed input: 320.31 toks/s, output: 102.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<13:37, 1.54it/s, est. speed input: 381.54 toks/s, output: 126.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:34<13:37, 1.53it/s, est. speed input: 412.13 toks/s, output: 139.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:34<09:47, 2.12it/s, est. speed input: 470.09 toks/s, output: 161.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:35<06:59, 2.96it/s, est. speed input: 533.20 toks/s, output: 182.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<05:04, 4.06it/s, est. speed input: 595.45 toks/s, output: 202.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<04:37, 4.44it/s, est. speed input: 643.56 toks/s, output: 224.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:36<02:57, 6.89it/s, est. speed input: 757.91 toks/s, output: 268.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:37<02:00, 10.07it/s, est. speed input: 868.81 toks/s, output: 312.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<02:14, 8.98it/s, est. speed input: 910.50 toks/s, output: 333.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:38<02:11, 9.11it/s, est. speed input: 957.77 toks/s, output: 356.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:38<01:51, 10.75it/s, est. speed input: 1014.13 toks/s, output: 376.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:38<01:39, 11.99it/s, est. speed input: 1066.16 toks/s, output: 403.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:39<01:30, 13.13it/s, est. speed input: 1116.03 toks/s, output: 425.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:39<01:18, 15.00it/s, est. speed input: 1167.95 toks/s, output: 450.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:39<01:10, 16.75it/s, est. speed input: 1218.07 toks/s, output: 477.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:39<00:59, 19.83it/s, est. speed input: 1266.71 toks/s, output: 503.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<00:37, 30.48it/s, est. speed input: 1431.37 toks/s, output: 579.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:40<00:48, 23.40it/s, est. speed input: 1521.44 toks/s, output: 624.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:40<00:48, 23.66it/s, est. speed input: 1569.72 toks/s, output: 649.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:41<00:30, 37.13it/s, est. speed input: 1779.73 toks/s, output: 741.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:41<00:37, 29.58it/s, est. speed input: 1896.56 toks/s, output: 787.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:41<00:42, 26.25it/s, est. speed input: 1938.65 toks/s, output: 815.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:41<00:32, 33.61it/s, est. speed input: 2040.91 toks/s, output: 867.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:42<00:36, 29.88it/s, est. speed input: 2124.54 toks/s, output: 919.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:42<00:34, 31.63it/s, est. speed input: 2173.51 toks/s, output: 941.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:42<00:37, 28.75it/s, est. speed input: 2211.89 toks/s, output: 961.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:42<00:34, 31.07it/s, est. speed input: 2263.58 toks/s, output: 986.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:43<00:35, 29.80it/s, est. speed input: 2399.17 toks/s, output: 1055.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:43<00:31, 33.27it/s, est. speed input: 2488.97 toks/s, output: 1105.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:43<00:28, 36.19it/s, est. speed input: 2583.62 toks/s, output: 1150.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:44<00:27, 36.69it/s, est. speed input: 2666.77 toks/s, output: 1194.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:44<00:19, 52.00it/s, est. speed input: 2824.53 toks/s, output: 1283.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:44<00:21, 47.34it/s, est. speed input: 2911.27 toks/s, output: 1322.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:44<00:26, 37.24it/s, est. speed input: 2986.23 toks/s, output: 1362.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 295/1280 [00:44<00:25, 38.78it/s, est. speed input: 3032.55 toks/s, output: 1390.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:45<00:24, 40.43it/s, est. speed input: 3075.16 toks/s, output: 1411.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:45<00:20, 46.78it/s, est. speed input: 3167.54 toks/s, output: 1466.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:45<00:16, 56.90it/s, est. speed input: 3262.93 toks/s, output: 1516.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 330/1280 [00:45<00:15, 59.88it/s, est. speed input: 3359.30 toks/s, output: 1560.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:45<00:18, 52.15it/s, est. speed input: 3440.78 toks/s, output: 1608.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:46<00:20, 45.49it/s, est. speed input: 3516.19 toks/s, output: 1662.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:46<00:15, 59.83it/s, est. speed input: 3652.57 toks/s, output: 1740.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:46<00:13, 67.44it/s, est. speed input: 3784.10 toks/s, output: 1826.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:46<00:14, 60.50it/s, est. speed input: 3866.48 toks/s, output: 1876.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:46<00:18, 47.43it/s, est. speed input: 3938.18 toks/s, output: 1930.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:47<00:15, 57.22it/s, est. speed input: 4068.96 toks/s, output: 2020.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:47<00:17, 49.31it/s, est. speed input: 4145.82 toks/s, output: 2059.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:47<00:15, 55.37it/s, est. speed input: 4230.45 toks/s, output: 2115.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:47<00:14, 57.13it/s, est. speed input: 4314.28 toks/s, output: 2152.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:47<00:15, 51.73it/s, est. speed input: 4391.77 toks/s, output: 2208.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:48<00:11, 70.65it/s, est. speed input: 4570.55 toks/s, output: 2303.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 490/1280 [00:48<00:11, 70.15it/s, est. speed input: 4690.38 toks/s, output: 2387.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:48<00:08, 95.45it/s, est. speed input: 4909.27 toks/s, output: 2516.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:48<00:08, 93.20it/s, est. speed input: 5036.10 toks/s, output: 2592.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:48<00:08, 83.02it/s, est. speed input: 5154.10 toks/s, output: 2677.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:48<00:07, 98.52it/s, est. speed input: 5336.65 toks/s, output: 2818.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:49<00:07, 92.48it/s, est. speed input: 5458.33 toks/s, output: 2890.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:49<00:07, 89.43it/s, est. speed input: 5539.85 toks/s, output: 2948.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:49<00:10, 64.18it/s, est. speed input: 5598.45 toks/s, output: 3006.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:49<00:08, 76.31it/s, est. speed input: 5717.77 toks/s, output: 3093.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:49<00:09, 70.44it/s, est. speed input: 5790.70 toks/s, output: 3137.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:49<00:09, 66.53it/s, est. speed input: 5864.97 toks/s, output: 3191.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:50<00:09, 68.09it/s, est. speed input: 6055.20 toks/s, output: 3334.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:50<00:07, 78.30it/s, est. speed input: 6182.27 toks/s, output: 3416.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:50<00:07, 76.29it/s, est. speed input: 6256.25 toks/s, output: 3485.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:50<00:07, 78.08it/s, est. speed input: 6337.17 toks/s, output: 3541.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:50<00:05, 105.89it/s, est. speed input: 6591.00 toks/s, output: 3731.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:51<00:06, 86.96it/s, est. speed input: 6701.36 toks/s, output: 3812.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 755/1280 [00:51<00:05, 93.27it/s, est. speed input: 6821.34 toks/s, output: 3911.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:51<00:04, 109.23it/s, est. speed input: 6985.88 toks/s, output: 4033.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▎ | 800/1280 [00:51<00:03, 123.70it/s, est. speed input: 7187.57 toks/s, output: 4193.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▎ | 815/1280 [00:51<00:04, 110.77it/s, est. speed input: 7291.89 toks/s, output: 4277.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:51<00:03, 114.10it/s, est. speed input: 7408.96 toks/s, output: 4375.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:51<00:03, 127.81it/s, est. speed input: 7569.59 toks/s, output: 4506.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:52<00:03, 117.93it/s, est. speed input: 7684.21 toks/s, output: 4601.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:52<00:03, 109.43it/s, est. speed input: 7791.70 toks/s, output: 4704.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:52<00:03, 101.16it/s, est. speed input: 7898.86 toks/s, output: 4797.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:52<00:03, 109.55it/s, est. speed input: 8014.60 toks/s, output: 4866.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:52<00:03, 111.04it/s, est. speed input: 8128.26 toks/s, output: 4981.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:52<00:02, 141.98it/s, est. speed input: 8375.18 toks/s, output: 5158.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:53<00:01, 155.70it/s, est. speed input: 8614.87 toks/s, output: 5357.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1010/1280 [00:53<00:01, 168.91it/s, est. speed input: 8817.79 toks/s, output: 5546.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1030/1280 [00:53<00:01, 147.29it/s, est. speed input: 8963.21 toks/s, output: 5685.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1055/1280 [00:53<00:01, 160.57it/s, est. speed input: 9162.84 toks/s, output: 5855.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:53<00:01, 122.15it/s, est. speed input: 9307.18 toks/s, output: 5970.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:53<00:01, 132.08it/s, est. speed input: 9461.17 toks/s, output: 6097.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:53<00:01, 132.23it/s, est. speed input: 9572.46 toks/s, output: 6199.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1125/1280 [00:54<00:01, 125.08it/s, est. speed input: 9680.16 toks/s, output: 6307.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:54<00:01, 113.03it/s, est. speed input: 9777.16 toks/s, output: 6388.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:54<00:01, 112.46it/s, est. speed input: 9884.36 toks/s, output: 6493.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:54<00:01, 100.59it/s, est. speed input: 9977.30 toks/s, output: 6585.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:54<00:01, 71.12it/s, est. speed input: 10040.91 toks/s, output: 6673.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:55<00:01, 76.59it/s, est. speed input: 10139.04 toks/s, output: 6773.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:55<00:00, 79.23it/s, est. speed input: 10205.30 toks/s, output: 6838.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:55<00:00, 69.84it/s, est. speed input: 10256.41 toks/s, output: 6934.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:55<00:00, 71.35it/s, est. speed input: 10317.41 toks/s, output: 7002.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:00, 71.04it/s, est. speed input: 10408.20 toks/s, output: 7102.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:55<00:00, 63.29it/s, est. speed input: 10456.11 toks/s, output: 7212.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:00, 45.42it/s, est. speed input: 10467.92 toks/s, output: 7264.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:57<00:00, 25.53it/s, est. speed input: 10402.11 toks/s, output: 7255.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:11<00:00, 25.53it/s, est. speed input: 10402.11 toks/s, output: 7255.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:12<00:00, 1.91it/s, est. speed input: 8267.32 toks/s, output: 5810.64 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:12<00:00, 17.71it/s, est. speed input: 8267.32 toks/s, output: 5810.64 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002334449382033199, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.16340689198114e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.02180599607527256, 'actor/pg_clipfrac': 0.0017123287543654442, 'actor/ppo_kl': -0.0006738770753145218}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.10581865161657333, 'actor/pg_clipfrac': 0.001857010181993246, 'actor/ppo_kl': 0.00132915994618088}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.1250578612089157, 'actor/pg_clipfrac': 0.0005558643606491387, 'actor/ppo_kl': 0.0007804342894814909}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.2606758177280426, 'actor/pg_clipfrac': 0.0006234414177015424, 'actor/ppo_kl': 0.00045751038123853505}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.5664920806884766, 'actor/pg_clipfrac': 0.0005770340212620795, 'actor/ppo_kl': -0.0003278757503721863}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0005085685988888144, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005314053269103169}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.158409982919693, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005686126532964408}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.278815358877182, 'actor/pg_clipfrac': 0.0009487665956839919, 'actor/ppo_kl': 0.0013260922860354185}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.16171543300151825, 'actor/pg_clipfrac': 0.003853564616292715, 'actor/ppo_kl': 0.0012157013406977057}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.000491115904878825, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00026059692027047276}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.8354337811470032, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00022681086556985974}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0006843310547992587, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00025030822143889964}
[36m(Runner pid=3309020)[0m Step 68
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.241
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.035
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.016
[36m(Runner pid=3309020)[0m ppo_kl: 7.688831753789316e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.023
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.023
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.685
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.685
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1014395
[36m(Runner pid=3309020)[0m balanced_min: 1014395
[36m(Runner pid=3309020)[0m max: 1020359
[36m(Runner pid=3309020)[0m mean: 1014395.0
[36m(Runner pid=3309020)[0m min: 1008431
[36m(Runner pid=3309020)[0m minmax_diff: 11928
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.751
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.124
[36m(Runner pid=3309020)[0m throughput: 1181.336
[36m(Runner pid=3309020)[0m time_per_step: 858.684
[36m(Runner pid=3309020)[0m total_num_tokens: 2028790
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 693.0
[36m(Runner pid=3309020)[0m mean: 467.135
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1309.0
[36m(Runner pid=3309020)[0m mean: 325.361
[36m(Runner pid=3309020)[0m min: 71.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.371
[36m(Runner pid=3309020)[0m format: 0.998
[36m(Runner pid=3309020)[0m overall: 0.685
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.829924340571522e-05
[36m(Runner pid=3309020)[0m gen: 0.132
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.278
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.179
[36m(Runner pid=3309020)[0m gen: 109.82
[36m(Runner pid=3309020)[0m old: 88.597
[36m(Runner pid=3309020)[0m ref: 88.276
[36m(Runner pid=3309020)[0m reward: 6.345
[36m(Runner pid=3309020)[0m step: 858.684
[36m(Runner pid=3309020)[0m update_actor: 564.784
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 69; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.64 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:34:10 [executor_base.py:219] It took 0.338453 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.56 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:35:53 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:34:10 [executor_base.py:219] It took 0.331135 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:35:54 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:35:54 [executor_base.py:208] It took 0.327290 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:35:55 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:35:56 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:35:56 [executor_base.py:208] It took 0.328452 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00030619435710832477, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001241353340446949}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0005439277738332748, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009895258117467165}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.22472147643566132, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.03178859502077103, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00023311313998419791, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.334486722946167, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001112070749513805}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.515031635761261, 'actor/pg_clipfrac': 0.001336898421868682, 'actor/ppo_kl': 0.0005451880861073732}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.000375368312234059, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000758305483032018}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.1202564537525177, 'actor/pg_clipfrac': 0.0016313213855028152, 'actor/ppo_kl': 5.479349056258798e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.49800604581832886, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009232103475369513}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.06035003066062927, 'actor/pg_clipfrac': 0.0015999999595806003, 'actor/ppo_kl': -0.0014205077895894647}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.40755409002304077, 'actor/pg_clipfrac': 0.0020020019728690386, 'actor/ppo_kl': -0.0005091093480587006}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0003138592292089015, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00028281647246330976}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.01102221105247736, 'actor/pg_clipfrac': 0.0006930006784386933, 'actor/ppo_kl': 0.001790866837836802}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.11240577697753906, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 4.5669385144719854e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.0003060649032704532, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.2762168049812317, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001794232870452106}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00029436053591780365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0015126055805012584}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00029933155747130513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0009463160531595349}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.23253899812698364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001272268476895988}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.2105334848165512, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.1311479940777645e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.9138495922088623, 'actor/pg_clipfrac': 0.003144653979688883, 'actor/ppo_kl': 0.00023631929070688784}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4995006322860718, 'actor/pg_clipfrac': 0.002617801073938608, 'actor/ppo_kl': -0.00020527714514173567}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.14972348511219025, 'actor/pg_clipfrac': 0.0009276437922380865, 'actor/ppo_kl': 0.002220967784523964}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.00031870853854343295, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0025994088500738144}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.4717967212200165, 'actor/pg_clipfrac': 0.002114164875820279, 'actor/ppo_kl': 2.1312840544851497e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.2852627635002136, 'actor/pg_clipfrac': 0.0007627765298821032, 'actor/ppo_kl': -0.000852982746437192}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.07639780640602112, 'actor/pg_clipfrac': 0.005891016218811274, 'actor/ppo_kl': -0.004306461662054062}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00018366056610830128, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008746787789277732}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.4077087342739105, 'actor/pg_clipfrac': 0.0017730495892465115, 'actor/ppo_kl': 9.949494415195659e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.02429887466132641, 'actor/pg_clipfrac': 0.0017652250826358795, 'actor/ppo_kl': -0.0002556941471993923}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11948651820421219, 'actor/pg_clipfrac': 0.0013661201810464263, 'actor/ppo_kl': 0.0012840860290452838}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.652000904083252, 'actor/pg_clipfrac': 0.00025278059183619916, 'actor/ppo_kl': 0.0001544822152936831}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.08847648650407791, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0018722619861364365}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.6827486157417297, 'actor/pg_clipfrac': 0.0010235414374619722, 'actor/ppo_kl': -0.0015369715401902795}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0006725142011418939, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007434005383402109}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.3852376639842987, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00024004923761822283}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.2750755548477173, 'actor/pg_clipfrac': 0.003997335210442543, 'actor/ppo_kl': -0.000104622078652028}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.14620985090732574, 'actor/pg_clipfrac': 0.0009587727836333215, 'actor/ppo_kl': -0.0003893441753461957}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00042031999328173697, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008775597088970244}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.15952086448669434, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012673134915530682}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.23616564273834229, 'actor/pg_clipfrac': 0.0022156573832035065, 'actor/ppo_kl': 0.000737269117962569}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.23340609669685364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0025271514896303415}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.2850872278213501, 'actor/pg_clipfrac': 0.004968944005668163, 'actor/ppo_kl': 0.0015125368954613805}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.4075223207473755, 'actor/pg_clipfrac': 0.002290950855240226, 'actor/ppo_kl': 7.405450560327154e-06}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:22<1:34:08, 4.43s/it, est. speed input: 103.39 toks/s, output: 25.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<48:35, 2.30s/it, est. speed input: 176.03 toks/s, output: 45.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:29<32:43, 1.55s/it, est. speed input: 235.42 toks/s, output: 64.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:30<21:52, 1.04s/it, est. speed input: 300.44 toks/s, output: 87.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:31<15:18, 1.37it/s, est. speed input: 361.64 toks/s, output: 109.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:32<11:12, 1.86it/s, est. speed input: 426.41 toks/s, output: 130.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:33<08:13, 2.52it/s, est. speed input: 490.57 toks/s, output: 152.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:33<05:52, 3.52it/s, est. speed input: 554.79 toks/s, output: 177.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:35<06:23, 3.22it/s, est. speed input: 588.56 toks/s, output: 189.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:35<04:36, 4.45it/s, est. speed input: 651.45 toks/s, output: 205.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:35<04:03, 5.03it/s, est. speed input: 704.51 toks/s, output: 225.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<02:22, 8.55it/s, est. speed input: 825.24 toks/s, output: 271.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<02:09, 9.35it/s, est. speed input: 882.45 toks/s, output: 287.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<02:04, 9.71it/s, est. speed input: 933.75 toks/s, output: 309.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 85/1280 [00:37<01:21, 14.58it/s, est. speed input: 1050.93 toks/s, output: 349.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:19, 15.00it/s, est. speed input: 1103.02 toks/s, output: 369.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:38<01:25, 13.78it/s, est. speed input: 1149.39 toks/s, output: 388.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:38<00:56, 20.62it/s, est. speed input: 1262.66 toks/s, output: 437.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:55, 21.13it/s, est. speed input: 1318.04 toks/s, output: 458.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:38<00:53, 21.61it/s, est. speed input: 1368.85 toks/s, output: 475.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:48, 24.03it/s, est. speed input: 1424.10 toks/s, output: 492.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:38<00:28, 40.66it/s, est. speed input: 1593.95 toks/s, output: 568.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:39<00:34, 32.97it/s, est. speed input: 1699.41 toks/s, output: 610.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:39<00:47, 23.61it/s, est. speed input: 1778.77 toks/s, output: 651.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:40<00:44, 25.32it/s, est. speed input: 1876.32 toks/s, output: 688.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:40<00:31, 34.67it/s, est. speed input: 2033.60 toks/s, output: 757.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:40<00:28, 38.38it/s, est. speed input: 2138.11 toks/s, output: 810.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:40<00:32, 33.62it/s, est. speed input: 2183.25 toks/s, output: 829.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:41<00:31, 34.84it/s, est. speed input: 2232.20 toks/s, output: 852.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:41<00:29, 36.07it/s, est. speed input: 2285.89 toks/s, output: 874.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:41<00:21, 49.47it/s, est. speed input: 2445.02 toks/s, output: 937.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:41<00:28, 37.39it/s, est. speed input: 2534.35 toks/s, output: 986.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:38, 27.27it/s, est. speed input: 2565.13 toks/s, output: 1002.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:42<00:35, 29.55it/s, est. speed input: 2614.78 toks/s, output: 1022.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:42<00:40, 25.58it/s, est. speed input: 2653.47 toks/s, output: 1047.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:38, 26.46it/s, est. speed input: 2698.66 toks/s, output: 1071.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:43<00:40, 25.17it/s, est. speed input: 2779.70 toks/s, output: 1114.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:43<00:35, 28.21it/s, est. speed input: 2826.01 toks/s, output: 1126.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:43<00:34, 29.02it/s, est. speed input: 2870.13 toks/s, output: 1152.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:43<00:31, 32.33it/s, est. speed input: 2918.72 toks/s, output: 1174.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:30, 32.38it/s, est. speed input: 3003.80 toks/s, output: 1221.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:43<00:28, 35.22it/s, est. speed input: 3045.82 toks/s, output: 1248.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:44<00:18, 51.95it/s, est. speed input: 3194.11 toks/s, output: 1332.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:44<00:17, 55.88it/s, est. speed input: 3291.50 toks/s, output: 1391.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:44<00:16, 58.99it/s, est. speed input: 3380.12 toks/s, output: 1443.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:44<00:12, 78.03it/s, est. speed input: 3536.33 toks/s, output: 1530.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:44<00:16, 57.61it/s, est. speed input: 3615.55 toks/s, output: 1578.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:44<00:15, 60.45it/s, est. speed input: 3702.71 toks/s, output: 1636.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:45<00:17, 50.94it/s, est. speed input: 3783.81 toks/s, output: 1675.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:45<00:23, 37.59it/s, est. speed input: 3851.16 toks/s, output: 1720.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 385/1280 [00:45<00:24, 35.83it/s, est. speed input: 3891.47 toks/s, output: 1749.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:45<00:20, 43.79it/s, est. speed input: 3981.86 toks/s, output: 1803.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:46<00:18, 48.00it/s, est. speed input: 4075.12 toks/s, output: 1854.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:46<00:10, 80.52it/s, est. speed input: 4319.53 toks/s, output: 2002.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:46<00:10, 80.69it/s, est. speed input: 4407.52 toks/s, output: 2061.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:46<00:11, 69.83it/s, est. speed input: 4505.34 toks/s, output: 2113.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:46<00:15, 52.59it/s, est. speed input: 4576.32 toks/s, output: 2155.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:47<00:12, 62.52it/s, est. speed input: 4705.37 toks/s, output: 2243.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:47<00:12, 63.08it/s, est. speed input: 4786.50 toks/s, output: 2293.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▊ | 495/1280 [00:47<00:12, 60.79it/s, est. speed input: 4864.71 toks/s, output: 2336.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:47<00:09, 80.05it/s, est. speed input: 5087.17 toks/s, output: 2464.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:47<00:08, 92.24it/s, est. speed input: 5225.14 toks/s, output: 2535.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:47<00:06, 116.37it/s, est. speed input: 5454.15 toks/s, output: 2651.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 575/1280 [00:47<00:06, 108.45it/s, est. speed input: 5578.06 toks/s, output: 2727.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:48<00:06, 110.74it/s, est. speed input: 5704.28 toks/s, output: 2817.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:48<00:06, 101.03it/s, est. speed input: 5914.70 toks/s, output: 2943.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:48<00:06, 105.72it/s, est. speed input: 6039.34 toks/s, output: 3031.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 655/1280 [00:48<00:05, 122.39it/s, est. speed input: 6255.02 toks/s, output: 3169.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:49<00:06, 87.65it/s, est. speed input: 6395.56 toks/s, output: 3276.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:49<00:06, 95.16it/s, est. speed input: 6514.68 toks/s, output: 3360.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:49<00:05, 99.94it/s, est. speed input: 6640.55 toks/s, output: 3450.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▋ | 720/1280 [00:49<00:05, 104.87it/s, est. speed input: 6761.12 toks/s, output: 3539.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 735/1280 [00:49<00:04, 109.68it/s, est. speed input: 6884.49 toks/s, output: 3615.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:49<00:05, 98.93it/s, est. speed input: 7002.12 toks/s, output: 3688.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:49<00:05, 101.81it/s, est. speed input: 7119.51 toks/s, output: 3759.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:49<00:04, 110.46it/s, est. speed input: 7247.65 toks/s, output: 3871.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:50<00:05, 91.68it/s, est. speed input: 7425.97 toks/s, output: 4012.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 820/1280 [00:50<00:05, 81.75it/s, est. speed input: 7520.80 toks/s, output: 4091.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 830/1280 [00:50<00:05, 82.31it/s, est. speed input: 7592.97 toks/s, output: 4155.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 840/1280 [00:50<00:06, 72.33it/s, est. speed input: 7663.38 toks/s, output: 4195.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▋ | 850/1280 [00:51<00:06, 61.79it/s, est. speed input: 7718.82 toks/s, output: 4238.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:51<00:06, 66.73it/s, est. speed input: 7792.40 toks/s, output: 4280.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 870/1280 [00:51<00:05, 71.39it/s, est. speed input: 7860.54 toks/s, output: 4343.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:51<00:05, 73.43it/s, est. speed input: 7934.20 toks/s, output: 4411.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 905/1280 [00:51<00:03, 109.34it/s, est. speed input: 8158.00 toks/s, output: 4564.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 925/1280 [00:51<00:03, 106.35it/s, est. speed input: 8306.98 toks/s, output: 4706.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:51<00:01, 165.73it/s, est. speed input: 8675.11 toks/s, output: 5027.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:52<00:02, 130.84it/s, est. speed input: 8837.36 toks/s, output: 5165.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:52<00:01, 147.25it/s, est. speed input: 9042.69 toks/s, output: 5318.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:52<00:01, 133.52it/s, est. speed input: 9182.64 toks/s, output: 5462.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:52<00:01, 129.34it/s, est. speed input: 9306.18 toks/s, output: 5578.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:52<00:01, 130.41it/s, est. speed input: 9419.41 toks/s, output: 5705.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1080/1280 [00:52<00:01, 114.24it/s, est. speed input: 9511.30 toks/s, output: 5793.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 1095/1280 [00:53<00:01, 116.07it/s, est. speed input: 9627.89 toks/s, output: 5898.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 87%|████████▋ | 1110/1280 [00:53<00:02, 77.30it/s, est. speed input: 9692.32 toks/s, output: 5974.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:53<00:02, 70.60it/s, est. speed input: 9746.27 toks/s, output: 6045.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 1140/1280 [00:53<00:01, 74.47it/s, est. speed input: 9873.76 toks/s, output: 6178.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:54<00:02, 59.99it/s, est. speed input: 9903.84 toks/s, output: 6227.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:54<00:01, 70.00it/s, est. speed input: 10037.47 toks/s, output: 6376.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1180/1280 [00:54<00:01, 64.09it/s, est. speed input: 10080.15 toks/s, output: 6427.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:01, 65.37it/s, est. speed input: 10138.53 toks/s, output: 6510.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:55<00:01, 52.94it/s, est. speed input: 10184.73 toks/s, output: 6584.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:55<00:01, 47.48it/s, est. speed input: 10219.35 toks/s, output: 6633.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:55<00:01, 47.64it/s, est. speed input: 10265.95 toks/s, output: 6723.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:55<00:00, 49.64it/s, est. speed input: 10322.56 toks/s, output: 6806.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:55<00:00, 53.29it/s, est. speed input: 10369.59 toks/s, output: 6905.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:56<00:00, 35.15it/s, est. speed input: 10349.95 toks/s, output: 6946.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:56<00:00, 37.19it/s, est. speed input: 10388.03 toks/s, output: 7011.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:56<00:00, 30.99it/s, est. speed input: 10377.21 toks/s, output: 7038.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [00:59<00:00, 7.66it/s, est. speed input: 9951.90 toks/s, output: 6762.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:04<00:00, 2.96it/s, est. speed input: 9161.95 toks/s, output: 6255.16 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:04<00:00, 19.70it/s, est. speed input: 9161.95 toks/s, output: 6255.16 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.6180446743965149, 'actor/pg_clipfrac': 0.0008510638144798577, 'actor/ppo_kl': -0.0011058986419811845}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.07674980163574219, 'actor/pg_clipfrac': 0.0009624639060348272, 'actor/ppo_kl': 0.0002004258130909875}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.3557789921760559, 'actor/pg_clipfrac': 0.006870228797197342, 'actor/ppo_kl': -9.755578503245488e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.3050585091114044, 'actor/pg_clipfrac': 0.0009671180159784853, 'actor/ppo_kl': -0.0011537540704011917}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.0002692870039027184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001458960585296154}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.12323568016290665, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010091307340189815}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.7040833234786987, 'actor/pg_clipfrac': 0.002938295714557171, 'actor/ppo_kl': 0.0005050625768490136}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00048116836114786565, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002238620538264513}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.08063016086816788, 'actor/pg_clipfrac': 0.0022271715570241213, 'actor/ppo_kl': -0.0011565776076167822}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.040761277079582214, 'actor/pg_clipfrac': 0.00186741363722831, 'actor/ppo_kl': -0.001069757272489369}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00025995110627263784, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004764833429362625}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.2505556046962738, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000966075633186847}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.49296560883522034, 'actor/pg_clipfrac': 0.000554631173145026, 'actor/ppo_kl': 9.756780855241232e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.0003070920647587627, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005270303227007389}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.16722257435321808, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0016908814432099462}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00038756662979722023, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 3.0404953577090055e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.20395661890506744, 'actor/pg_clipfrac': 0.004098360426723957, 'actor/ppo_kl': 0.0018420896958559752}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.9202437996864319, 'actor/pg_clipfrac': 0.000650618108920753, 'actor/ppo_kl': -0.0010159467346966267}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.1766483634710312, 'actor/pg_clipfrac': 0.0029739777091890574, 'actor/ppo_kl': 0.0019662368576973677}
[36m(Runner pid=3309020)[0m Step 69
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.247
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.022
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.019
[36m(Runner pid=3309020)[0m ppo_kl: 5.2558220993326185e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.028
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.028
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.677
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1010875
[36m(Runner pid=3309020)[0m balanced_min: 1010875
[36m(Runner pid=3309020)[0m max: 1017560
[36m(Runner pid=3309020)[0m mean: 1010875.0
[36m(Runner pid=3309020)[0m min: 1004190
[36m(Runner pid=3309020)[0m minmax_diff: 13370
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 111.69
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 76.996
[36m(Runner pid=3309020)[0m mfu_actor: 0.124
[36m(Runner pid=3309020)[0m throughput: 1166.669
[36m(Runner pid=3309020)[0m time_per_step: 866.462
[36m(Runner pid=3309020)[0m total_num_tokens: 2021750
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 687.0
[36m(Runner pid=3309020)[0m mean: 465.98
[36m(Runner pid=3309020)[0m min: 411.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 3305.0
[36m(Runner pid=3309020)[0m mean: 323.766
[36m(Runner pid=3309020)[0m min: 59.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.357
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.677
[36m(Runner pid=3309020)[0m tag_reward: 0.999
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 7.835439395625202e-05
[36m(Runner pid=3309020)[0m gen: 0.144
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.007
[36m(Runner pid=3309020)[0m update_actor: 0.279
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.158
[36m(Runner pid=3309020)[0m gen: 119.323
[36m(Runner pid=3309020)[0m old: 87.765
[36m(Runner pid=3309020)[0m ref: 88.495
[36m(Runner pid=3309020)[0m reward: 6.123
[36m(Runner pid=3309020)[0m step: 866.462
[36m(Runner pid=3309020)[0m update_actor: 563.971
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 70; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:48:43 [executor_base.py:219] It took 0.342434 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.65 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.75 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:48:43 [executor_base.py:219] It took 0.342474 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:50:21 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:50:21 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 14:50:21 [executor_base.py:208] It took 0.325760 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.83 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:50:42 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:50:43 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 14:50:43 [executor_base.py:208] It took 0.328367 seconds to fall asleep.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.10275527089834213, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.07379116863012314, 'actor/pg_clipfrac': 0.0009756097570061684, 'actor/ppo_kl': 0.0013158528599888086}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.5359496474266052, 'actor/pg_clipfrac': 0.001406469731591642, 'actor/ppo_kl': 0.0005474734352901578}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.3016120195388794, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.371919184923172, 'actor/pg_clipfrac': 0.0015186028322204947, 'actor/ppo_kl': 0.0011192907113581896}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.4990483522415161, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003733066550921649}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.45784425735473633, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00029290205566212535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.25493258237838745, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 8.93967371666804e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.0004049072740599513, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005630046362057328}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.334679514169693, 'actor/pg_clipfrac': 0.005063291173428297, 'actor/ppo_kl': 0.0020286221988499165}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.8606234192848206, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006179549382068217}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002577258856035769, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0014048517914488912}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.3022419512271881, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00019953539595007896, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0008995148236863315}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00024097875575535, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00016048646648414433}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.722145140171051, 'actor/pg_clipfrac': 0.004127358552068472, 'actor/ppo_kl': -0.0008510621264576912}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.07922367751598358, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012654614401981235}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.11029583215713501, 'actor/pg_clipfrac': 0.000596302910707891, 'actor/ppo_kl': -0.0015017896657809615}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.12873031198978424, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 2.40359986491967e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002988929918501526, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0001376619766233489}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.5561613440513611, 'actor/pg_clipfrac': 0.0009578543831594288, 'actor/ppo_kl': 0.0003765899164136499}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.3386552333831787, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0010452679125592113}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.5131826400756836, 'actor/pg_clipfrac': 0.0008810572908259928, 'actor/ppo_kl': -0.001355573651380837}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.32217681407928467, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005000803503207862}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1336188167333603, 'actor/pg_clipfrac': 0.0008382229716517031, 'actor/ppo_kl': -0.0007514345925301313}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.17918981611728668, 'actor/pg_clipfrac': 0.0012771391775459051, 'actor/ppo_kl': -0.0015874719247221947}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2599148750305176, 'actor/pg_clipfrac': 0.0007662835414521396, 'actor/ppo_kl': 0.00035738159203901887}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.21072450280189514, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0003151493438053876}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.13806983828544617, 'actor/pg_clipfrac': 0.001251564477570355, 'actor/ppo_kl': 4.416263891471317e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.4444621205329895, 'actor/pg_clipfrac': 0.0029411765281111, 'actor/ppo_kl': -0.0013084622332826257}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.00017978978576138616, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000591080344747752}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.5179910063743591, 'actor/pg_clipfrac': 0.00223380490206182, 'actor/ppo_kl': 2.5643401386332698e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.08793187141418457, 'actor/pg_clipfrac': 0.0013661201810464263, 'actor/ppo_kl': 0.00039757275953888893}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.24921783804893494, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00192058936227113}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.14576789736747742, 'actor/pg_clipfrac': 0.0012853470398113132, 'actor/ppo_kl': 0.0008689076057635248}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.15287451446056366, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003104989882558584}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.254698783159256, 'actor/pg_clipfrac': 0.0006365372100844979, 'actor/ppo_kl': 0.00012250860163476318}
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/377 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 1/377 [00:14<1:30:58, 14.52s/it, est. speed input: 31.27 toks/s, output: 4.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 2/377 [00:15<42:43, 6.84s/it, est. speed input: 57.71 toks/s, output: 10.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 3/377 [00:16<23:43, 3.81s/it, est. speed input: 84.70 toks/s, output: 16.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 6/377 [00:16<08:05, 1.31s/it, est. speed input: 168.30 toks/s, output: 37.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 9/377 [00:16<04:17, 1.43it/s, est. speed input: 252.62 toks/s, output: 57.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 12/377 [00:16<02:45, 2.21it/s, est. speed input: 330.72 toks/s, output: 79.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 14/377 [00:16<02:05, 2.90it/s, est. speed input: 382.70 toks/s, output: 94.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 16/377 [00:17<01:37, 3.71it/s, est. speed input: 434.51 toks/s, output: 109.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 18/377 [00:17<01:15, 4.76it/s, est. speed input: 484.40 toks/s, output: 124.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 22/377 [00:17<00:46, 7.70it/s, est. speed input: 587.86 toks/s, output: 157.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 25/377 [00:17<00:39, 8.90it/s, est. speed input: 659.62 toks/s, output: 180.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 33/377 [00:17<00:21, 16.34it/s, est. speed input: 873.69 toks/s, output: 247.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 37/377 [00:17<00:17, 18.90it/s, est. speed input: 969.93 toks/s, output: 281.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 42/377 [00:17<00:14, 22.91it/s, est. speed input: 1091.51 toks/s, output: 324.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 50/377 [00:18<00:10, 31.71it/s, est. speed input: 1289.48 toks/s, output: 395.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 55/377 [00:18<00:10, 31.86it/s, est. speed input: 1408.71 toks/s, output: 437.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 63/377 [00:18<00:09, 32.20it/s, est. speed input: 1591.60 toks/s, output: 506.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 67/377 [00:18<00:09, 32.38it/s, est. speed input: 1681.09 toks/s, output: 543.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 75/377 [00:18<00:07, 40.25it/s, est. speed input: 1870.03 toks/s, output: 617.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 80/377 [00:18<00:07, 40.64it/s, est. speed input: 1979.53 toks/s, output: 665.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 86/377 [00:19<00:07, 38.22it/s, est. speed input: 2103.40 toks/s, output: 720.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 91/377 [00:19<00:07, 37.08it/s, est. speed input: 2210.46 toks/s, output: 768.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 95/377 [00:19<00:07, 36.44it/s, est. speed input: 2292.59 toks/s, output: 805.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 101/377 [00:19<00:06, 40.53it/s, est. speed input: 2423.03 toks/s, output: 865.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 107/377 [00:19<00:06, 43.88it/s, est. speed input: 2550.82 toks/s, output: 926.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 116/377 [00:19<00:04, 53.74it/s, est. speed input: 2755.33 toks/s, output: 1020.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 122/377 [00:19<00:05, 47.59it/s, est. speed input: 2871.95 toks/s, output: 1079.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 127/377 [00:19<00:05, 45.78it/s, est. speed input: 2974.22 toks/s, output: 1129.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 132/377 [00:20<00:05, 44.51it/s, est. speed input: 3073.86 toks/s, output: 1181.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 137/377 [00:20<00:05, 43.77it/s, est. speed input: 3171.19 toks/s, output: 1232.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 144/377 [00:20<00:04, 48.20it/s, est. speed input: 3312.50 toks/s, output: 1308.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 151/377 [00:20<00:04, 51.78it/s, est. speed input: 3453.34 toks/s, output: 1384.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 163/377 [00:20<00:03, 66.97it/s, est. speed input: 3716.02 toks/s, output: 1524.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 170/377 [00:20<00:03, 65.82it/s, est. speed input: 3856.94 toks/s, output: 1603.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 178/377 [00:20<00:02, 68.18it/s, est. speed input: 4015.25 toks/s, output: 1695.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 185/377 [00:20<00:03, 58.37it/s, est. speed input: 4141.38 toks/s, output: 1771.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████▏ | 194/377 [00:20<00:02, 65.93it/s, est. speed input: 4327.91 toks/s, output: 1881.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 201/377 [00:21<00:03, 57.99it/s, est. speed input: 4452.14 toks/s, output: 1959.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 208/377 [00:21<00:02, 58.45it/s, est. speed input: 4581.42 toks/s, output: 2043.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 215/377 [00:21<00:03, 53.87it/s, est. speed input: 4703.84 toks/s, output: 2126.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 221/377 [00:21<00:03, 46.69it/s, est. speed input: 4798.00 toks/s, output: 2193.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 234/377 [00:21<00:02, 62.60it/s, est. speed input: 5060.28 toks/s, output: 2367.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 65%|██████▍ | 244/377 [00:21<00:02, 66.29it/s, est. speed input: 5248.04 toks/s, output: 2497.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 252/377 [00:21<00:01, 68.54it/s, est. speed input: 5394.25 toks/s, output: 2604.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 260/377 [00:22<00:01, 64.76it/s, est. speed input: 5536.06 toks/s, output: 2706.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 267/377 [00:22<00:01, 60.77it/s, est. speed input: 5654.75 toks/s, output: 2797.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 274/377 [00:22<00:01, 60.92it/s, est. speed input: 5777.77 toks/s, output: 2893.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 282/377 [00:22<00:01, 64.13it/s, est. speed input: 5916.95 toks/s, output: 3005.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 293/377 [00:22<00:01, 74.68it/s, est. speed input: 6127.39 toks/s, output: 3168.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|███████▉ | 301/377 [00:22<00:01, 62.37it/s, est. speed input: 6245.10 toks/s, output: 3276.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 309/377 [00:22<00:01, 66.05it/s, est. speed input: 6389.31 toks/s, output: 3398.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 317/377 [00:22<00:00, 67.69it/s, est. speed input: 6525.01 toks/s, output: 3520.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▌ | 325/377 [00:23<00:00, 52.80it/s, est. speed input: 6623.32 toks/s, output: 3627.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▉ | 335/377 [00:23<00:00, 62.11it/s, est. speed input: 6797.05 toks/s, output: 3793.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 342/377 [00:23<00:00, 39.62it/s, est. speed input: 6833.27 toks/s, output: 3865.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 348/377 [00:23<00:00, 31.55it/s, est. speed input: 6862.64 toks/s, output: 3937.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▎| 353/377 [00:24<00:00, 25.14it/s, est. speed input: 6863.59 toks/s, output: 3988.65 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 357/377 [00:24<00:01, 19.42it/s, est. speed input: 6833.18 toks/s, output: 4017.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 360/377 [00:25<00:01, 16.23it/s, est. speed input: 6800.79 toks/s, output: 4038.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 363/377 [00:25<00:00, 14.80it/s, est. speed input: 6783.53 toks/s, output: 4070.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 365/377 [00:25<00:00, 13.25it/s, est. speed input: 6760.94 toks/s, output: 4085.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 367/377 [00:26<00:01, 8.36it/s, est. speed input: 6635.69 toks/s, output: 4044.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 369/377 [00:26<00:01, 5.41it/s, est. speed input: 6460.29 toks/s, output: 3976.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 370/377 [00:30<00:04, 1.48it/s, est. speed input: 5653.50 toks/s, output: 3513.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 371/377 [00:51<00:22, 3.83s/it, est. speed input: 3397.59 toks/s, output: 2180.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▊| 372/377 [01:01<00:24, 4.89s/it, est. speed input: 2874.18 toks/s, output: 1919.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 373/377 [01:04<00:18, 4.65s/it, est. speed input: 2716.15 toks/s, output: 1892.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 374/377 [01:05<00:11, 3.80s/it, est. speed input: 2684.74 toks/s, output: 1949.12 toks/s]
Processed prompts: 100%|██████████| 377/377 [01:05<00:00, 5.72it/s, est. speed input: 2705.21 toks/s, output: 2198.32 toks/s]
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0003108362143393606, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00019498921756166965}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002173793618567288, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00015013835218269378}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.280358225107193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00078067637514323}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 1.2696963548660278, 'actor/pg_clipfrac': 0.0001645548763917759, 'actor/ppo_kl': -0.0002590576186776161}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.210089311003685, 'actor/pg_clipfrac': 0.0009751340840011835, 'actor/ppo_kl': 0.00013455409498419613}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.0002905476139858365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013216930674389005}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00022292677022051066, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 8.093118958640844e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.08757178485393524, 'actor/pg_clipfrac': 0.0008431703317910433, 'actor/ppo_kl': 0.002368425251916051}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.2147664576768875, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0008842780371196568}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.6096621155738831, 'actor/pg_clipfrac': 0.0006385695887729526, 'actor/ppo_kl': -0.0004201465053483844}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00023046800924930722, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009529074304737151}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.39856815338134766, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004982570535503328}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00021497740817721933, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00020566312014125288}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0002794096653815359, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00013464670337270945}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.8515476584434509, 'actor/pg_clipfrac': 0.002583979396149516, 'actor/ppo_kl': 0.0014884737320244312}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0005512831849046052, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 9.594758012099192e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.08658574521541595, 'actor/pg_clipfrac': 0.0007434944272972643, 'actor/ppo_kl': 0.0003580767079256475}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0001950943551491946, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005160800646990538}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.19895000755786896, 'actor/pg_clipfrac': 0.0005820721853524446, 'actor/ppo_kl': -0.0013955263420939445}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.16697566211223602, 'actor/pg_clipfrac': 0.0025873221457004547, 'actor/ppo_kl': -0.0006642655935138464}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.12684011459350586, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00020507254521362484}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.2596057951450348, 'actor/pg_clipfrac': 0.00038895371835678816, 'actor/ppo_kl': 0.0013232555938884616}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.2553633451461792, 'actor/pg_clipfrac': 0.0010940919164568186, 'actor/ppo_kl': -9.058862633537501e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.8957974314689636, 'actor/pg_clipfrac': 0.0026298488955944777, 'actor/ppo_kl': -0.0004066477413289249}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.03845326974987984, 'actor/pg_clipfrac': 0.0020215632393956184, 'actor/ppo_kl': 0.0012245358666405082}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.20939549803733826, 'actor/pg_clipfrac': 0.0009569377871230245, 'actor/ppo_kl': 0.00025334060774184763}
[36m(Runner pid=3309020)[0m =====Sample VLM data=====
[36m(Runner pid=3309020)[0m prompt: ['<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>As shown in the figure, in triangle ABC, it is known that angle A = 80.0, angle B = 60.0, DE parallel BC, then the size of angle CED is ()<|im_end|>\n<|im_start|>assistant\n']
[36m(Runner pid=3309020)[0m images: [[]]
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:03:58 [executor_base.py:219] It took 0.338710 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.00 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 50.08 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:05:48 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:03:58 [executor_base.py:219] It took 0.339209 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:05:48 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 9.16 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:05:48 [executor_base.py:208] It took 0.325906 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 9.16 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:05:55 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:05:55 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 10.10 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:05:55 [executor_base.py:208] It took 0.329829 seconds to fall asleep.
[36m(Runner pid=3309020)[0m validation generation end
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the perimeter of triangle ABC is 16.0. Point D is the midpoint of the AB, BD = 2.0, passing point D is the vertical line l of AB, and E is any point on l, then the minimum perimeter of triangle AEC is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo find the minimum perimeter of triangle AEC, we need to consider the properties of the triangle and the given conditions. Since D is the midpoint of AB, AD = DB = 2.0. The line l is the vertical line passing through D, and E is any point on l. The perimeter of triangle AEC is the sum of the lengths of AE, EC, and AC. To minimize this perimeter, we need to consider the geometric properties and the fact that the shortest distance from a point to a line is a perpendicular distance.\n\nGiven that E is on the line l and D is the midpoint of AB, the minimum perimeter of triangle AEC will occur when E is directly above or below D on the line l. This means that the shortest distance from A to E is the same as the distance from D to E, and the same applies to C. Therefore, the minimum perimeter of triangle AEC is the sum of the lengths of AD and DC, which is equal to AB.\n\nSince the perimeter of triangle ABC is 16.0 and D is the midpoint of AB, we have AB = 2 * AD = 2 * 2 = 4. Therefore, the minimum perimeter of triangle AEC is the same as the length of AB, which is 4.\n\nHowever, since E is on the line l and D is the midpoint of AB, the correct approach is to consider the total length of the perimeter of triangle AEC as the sum of the lengths of AE, EC, and AC. The minimum perimeter occurs when E is directly above or below D on the line l, making the shortest distance from A to E and from C to E equal to AD and DC respectively. Therefore, the minimum perimeter of triangle AEC is the same as the length of AB, which is 4.\n\nBut since the problem gives us the perimeter of triangle ABC as 16.0 and D is the midpoint of AB, the correct minimum perimeter of triangle AEC is actually the sum of the lengths of AD, DE, and EC, which is the same as the length of AB, but considering the correct geometric properties, it simplifies to the length of AB, which is 4.\n\nTherefore, the minimum perimeter of triangle AEC is 4.\n\n\n4.0\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['12']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, in Rttriangle ABC, angle BAC = 90.0, rotate triangle ABC clockwise around point A by 90.0 to obtain triangle AB′C′ (the corresponding point of point B is point B′, and the corresponding point of point C is point C ′), connect CC′. If angle CC′B′ = 32.0, then the size of angle AC′B′ is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ["\nGiven that triangle ABC is rotated clockwise around point A by 90 degrees to obtain triangle AB'C', we know that angle BAC is 90 degrees. Since the rotation is by 90 degrees, angle B'AC' is also 90 degrees. The angle CC'B' is given as 32 degrees. Therefore, angle AC'B' can be found by subtracting angle CC'B' from 90 degrees, because angle AC'B' is the same as angle BAC' minus angle CC'B'. This gives us angle AC'B' = 90 - 32 = 58 degrees.\n\n\nThe size of angle AC'B' is 58 degrees.\n"]
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['13°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, the vertices A, B, and D of parallelogram ABCD are on circle O, and the vertex C is on the diameter BE of circle O, angle ADC = 54.0, connect AE, then the degree of angle AEB is ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve this problem, we need to use properties of circles and parallelograms. In a circle, the opposite angles of a parallelogram are equal, and the angle subtended by a diameter is a right angle (90 degrees).\n\nGiven that ABCD is a parallelogram, we know that opposite angles are equal. Therefore, angle ABC = angle ADC = 54 degrees.\n\nSince BE is the diameter of the circle, angle AEB is a right angle (90 degrees) because any angle subtended by a diameter in a circle is a right angle.\n\nNow, we need to find angle AEB. Since angle AEB is a right angle (90 degrees), and we know angle ABC = 54 degrees, we can subtract this from 90 degrees to find angle AEB.\n\nTherefore, angle AEB = 90 degrees - 54 degrees = 36 degrees.\n\nSo, the degree of angle AEB is 36 degrees.\n\n\n36\n']
[36m(Runner pid=3309020)[0m [score] [1.0]
[36m(Runner pid=3309020)[0m [ground_truth] ['36°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, AB is the diameter of circle O, point C is on the extended line of AB, CD is tangent to circle O, and the tangent point is D. If angle A = 35.0, then angle C = ()\nassistant\n']
[36m(Runner pid=3309020)[0m [output] ['\nTo solve for angle C, we need to use some properties of circles and tangents. Since AB is the diameter of circle O, angle ADB is a right angle (90 degrees) because any angle subtended by a diameter in a semicircle is a right angle. Given that angle A is 35 degrees, we can find angle DAB by subtracting angle A from 90 degrees. This gives us angle DAB = 90 - 35 = 55 degrees. Since CD is a tangent to the circle at point D, angle ODC is also 90 degrees. Therefore, angle C can be found by subtracting angle DAB from 90 degrees, which gives us angle C = 90 - 55 = 35 degrees.\n\n\n35 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['20°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m [prompt] ['system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags, and the final answer MUST BE enclosed within and tags.\nuser\nAs shown in the figure, circle O is the circumscribed circle of triangle ABC, AD is the diameter of circle O, and EA is the tangent of circle O. If angle EAC = 120.0, then the degree of angle ABC is ()\nassistant\n']
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:13<57:44, 2.72s/it, est. speed input: 167.07 toks/s, output: 23.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<55:56, 2.64s/it, est. speed input: 172.39 toks/s, output: 38.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<31:09, 1.48s/it, est. speed input: 254.09 toks/s, output: 62.49 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:33<29:54, 1.42s/it, est. speed input: 270.86 toks/s, output: 72.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:34<20:16, 1.03it/s, est. speed input: 327.94 toks/s, output: 97.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:35<14:15, 1.46it/s, est. speed input: 386.45 toks/s, output: 121.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:35<09:51, 2.10it/s, est. speed input: 445.21 toks/s, output: 147.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:35<07:12, 2.86it/s, est. speed input: 502.03 toks/s, output: 164.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<04:03, 5.05it/s, est. speed input: 628.75 toks/s, output: 209.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<03:10, 6.43it/s, est. speed input: 687.65 toks/s, output: 233.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▍ | 60/1280 [00:36<02:29, 8.16it/s, est. speed input: 753.19 toks/s, output: 255.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<01:52, 10.73it/s, est. speed input: 862.70 toks/s, output: 295.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▌ | 75/1280 [00:37<01:34, 12.75it/s, est. speed input: 922.59 toks/s, output: 319.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<01:19, 15.05it/s, est. speed input: 980.07 toks/s, output: 340.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 95/1280 [00:37<00:53, 22.20it/s, est. speed input: 1147.07 toks/s, output: 410.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:38<01:10, 16.64it/s, est. speed input: 1189.18 toks/s, output: 430.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:38<01:06, 17.70it/s, est. speed input: 1240.37 toks/s, output: 451.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<01:06, 17.66it/s, est. speed input: 1291.34 toks/s, output: 469.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:39<01:02, 18.64it/s, est. speed input: 1392.81 toks/s, output: 516.98 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<00:55, 20.89it/s, est. speed input: 1450.19 toks/s, output: 541.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:39<01:04, 17.74it/s, est. speed input: 1492.62 toks/s, output: 563.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:40<00:50, 22.50it/s, est. speed input: 1596.71 toks/s, output: 617.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:40<00:45, 24.70it/s, est. speed input: 1649.04 toks/s, output: 642.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:40<00:41, 26.93it/s, est. speed input: 1700.57 toks/s, output: 666.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:40<00:33, 33.24it/s, est. speed input: 1810.79 toks/s, output: 705.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:40<00:34, 32.34it/s, est. speed input: 1909.12 toks/s, output: 747.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▎ | 175/1280 [00:40<00:32, 33.90it/s, est. speed input: 1957.16 toks/s, output: 763.81 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:41<00:27, 39.73it/s, est. speed input: 2059.39 toks/s, output: 808.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:41<00:26, 40.62it/s, est. speed input: 2156.68 toks/s, output: 855.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 215/1280 [00:41<00:17, 59.68it/s, est. speed input: 2371.95 toks/s, output: 956.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 225/1280 [00:41<00:19, 53.93it/s, est. speed input: 2476.46 toks/s, output: 999.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:23, 43.66it/s, est. speed input: 2565.49 toks/s, output: 1042.58 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:42<00:29, 35.25it/s, est. speed input: 2697.20 toks/s, output: 1105.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:43<00:25, 39.88it/s, est. speed input: 2842.45 toks/s, output: 1187.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:43<00:24, 40.74it/s, est. speed input: 2890.61 toks/s, output: 1207.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:43<00:25, 38.79it/s, est. speed input: 2930.21 toks/s, output: 1234.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:43<00:20, 47.99it/s, est. speed input: 3025.54 toks/s, output: 1288.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 300/1280 [00:43<00:24, 40.09it/s, est. speed input: 3150.82 toks/s, output: 1353.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 310/1280 [00:44<00:33, 28.70it/s, est. speed input: 3210.85 toks/s, output: 1391.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 320/1280 [00:44<00:27, 34.35it/s, est. speed input: 3304.88 toks/s, output: 1443.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 26%|██▌ | 335/1280 [00:44<00:20, 45.93it/s, est. speed input: 3452.00 toks/s, output: 1526.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▊ | 365/1280 [00:45<00:14, 64.65it/s, est. speed input: 3740.94 toks/s, output: 1661.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 375/1280 [00:45<00:16, 56.09it/s, est. speed input: 3819.88 toks/s, output: 1703.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███ | 395/1280 [00:45<00:12, 68.71it/s, est. speed input: 3999.01 toks/s, output: 1809.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 405/1280 [00:45<00:12, 70.25it/s, est. speed input: 4093.60 toks/s, output: 1871.83 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:45<00:14, 59.51it/s, est. speed input: 4172.37 toks/s, output: 1908.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:46<00:12, 65.82it/s, est. speed input: 4352.51 toks/s, output: 2021.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:46<00:12, 68.59it/s, est. speed input: 4443.76 toks/s, output: 2070.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:46<00:09, 82.67it/s, est. speed input: 4632.64 toks/s, output: 2165.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:46<00:09, 88.17it/s, est. speed input: 4812.86 toks/s, output: 2248.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 505/1280 [00:46<00:07, 105.05it/s, est. speed input: 5001.36 toks/s, output: 2341.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:47<00:14, 53.13it/s, est. speed input: 5072.71 toks/s, output: 2382.24 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:47<00:16, 46.16it/s, est. speed input: 5134.31 toks/s, output: 2429.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 540/1280 [00:47<00:15, 48.41it/s, est. speed input: 5217.15 toks/s, output: 2490.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:48<00:13, 55.13it/s, est. speed input: 5301.82 toks/s, output: 2543.56 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 560/1280 [00:48<00:13, 51.46it/s, est. speed input: 5365.59 toks/s, output: 2598.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:48<00:14, 49.24it/s, est. speed input: 5439.31 toks/s, output: 2650.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:48<00:12, 55.04it/s, est. speed input: 5517.72 toks/s, output: 2702.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:48<00:12, 57.22it/s, est. speed input: 5596.53 toks/s, output: 2766.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 600/1280 [00:48<00:12, 56.48it/s, est. speed input: 5671.90 toks/s, output: 2808.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:49<00:10, 64.42it/s, est. speed input: 5794.97 toks/s, output: 2900.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 630/1280 [00:49<00:08, 74.13it/s, est. speed input: 5915.28 toks/s, output: 2982.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:49<00:08, 72.84it/s, est. speed input: 5988.88 toks/s, output: 3026.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 670/1280 [00:49<00:05, 113.09it/s, est. speed input: 6275.78 toks/s, output: 3208.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 690/1280 [00:49<00:05, 114.24it/s, est. speed input: 6434.00 toks/s, output: 3331.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:49<00:06, 94.08it/s, est. speed input: 6546.30 toks/s, output: 3414.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 725/1280 [00:50<00:05, 105.74it/s, est. speed input: 6722.39 toks/s, output: 3533.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 740/1280 [00:50<00:05, 94.51it/s, est. speed input: 6830.97 toks/s, output: 3610.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:50<00:04, 111.42it/s, est. speed input: 7002.02 toks/s, output: 3732.20 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:50<00:04, 117.75it/s, est. speed input: 7120.31 toks/s, output: 3831.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 62%|██████▏ | 790/1280 [00:50<00:03, 122.89it/s, est. speed input: 7246.48 toks/s, output: 3948.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:50<00:03, 121.84it/s, est. speed input: 7412.68 toks/s, output: 4066.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:51<00:05, 87.83it/s, est. speed input: 7515.06 toks/s, output: 4150.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:51<00:04, 91.76it/s, est. speed input: 7660.43 toks/s, output: 4279.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:51<00:04, 92.44it/s, est. speed input: 7771.08 toks/s, output: 4369.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 875/1280 [00:51<00:05, 68.00it/s, est. speed input: 7848.81 toks/s, output: 4444.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:51<00:04, 78.76it/s, est. speed input: 7964.94 toks/s, output: 4539.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|███████ | 900/1280 [00:52<00:04, 77.99it/s, est. speed input: 8050.16 toks/s, output: 4606.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████▏ | 915/1280 [00:52<00:04, 90.98it/s, est. speed input: 8164.48 toks/s, output: 4710.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 940/1280 [00:52<00:02, 117.76it/s, est. speed input: 8360.62 toks/s, output: 4863.16 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:52<00:02, 122.19it/s, est. speed input: 8470.32 toks/s, output: 4951.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:52<00:02, 123.30it/s, est. speed input: 8587.95 toks/s, output: 5050.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 985/1280 [00:52<00:02, 121.73it/s, est. speed input: 8701.95 toks/s, output: 5143.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:52<00:02, 107.87it/s, est. speed input: 8824.68 toks/s, output: 5238.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:52<00:01, 133.12it/s, est. speed input: 9024.44 toks/s, output: 5415.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:53<00:02, 100.52it/s, est. speed input: 9157.82 toks/s, output: 5527.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1060/1280 [00:53<00:02, 98.37it/s, est. speed input: 9261.72 toks/s, output: 5629.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▍ | 1075/1280 [00:53<00:01, 104.33it/s, est. speed input: 9369.31 toks/s, output: 5732.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:53<00:01, 98.06it/s, est. speed input: 9465.18 toks/s, output: 5858.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:53<00:01, 93.42it/s, est. speed input: 9567.09 toks/s, output: 5964.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1130/1280 [00:54<00:01, 112.28it/s, est. speed input: 9753.09 toks/s, output: 6127.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|█████████ | 1155/1280 [00:54<00:00, 137.18it/s, est. speed input: 9947.65 toks/s, output: 6329.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 92%|█████████▏| 1175/1280 [00:54<00:01, 102.32it/s, est. speed input: 10064.67 toks/s, output: 6490.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1190/1280 [00:54<00:00, 100.61it/s, est. speed input: 10162.94 toks/s, output: 6600.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1205/1280 [00:54<00:00, 99.53it/s, est. speed input: 10259.31 toks/s, output: 6685.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:54<00:00, 95.60it/s, est. speed input: 10349.38 toks/s, output: 6782.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:55<00:00, 97.43it/s, est. speed input: 10454.73 toks/s, output: 6902.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:55<00:00, 75.61it/s, est. speed input: 10520.92 toks/s, output: 7011.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:55<00:00, 59.20it/s, est. speed input: 10550.37 toks/s, output: 7066.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:55<00:00, 57.08it/s, est. speed input: 10597.20 toks/s, output: 7121.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 12.59it/s, est. speed input: 10192.96 toks/s, output: 6905.60 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [00:58<00:00, 21.85it/s, est. speed input: 10192.96 toks/s, output: 6905.60 toks/s]
[36m(Runner pid=3309020)[0m [output] ['\nGiven that AD is the diameter of circle O and EA is tangent to circle O at point A, we can use the properties of circles and tangents to solve this problem. Since EA is tangent to the circle at A, angle OAE is a right angle (90 degrees), as tangents to a circle are perpendicular to the radius at the point of tangency.\n\nGiven that angle EAC = 120 degrees, we can find angle AOC, which is twice angle EAC because the central angle is twice the inscribed angle subtended by the same arc. Thus, angle AOC = 2 * 120 degrees = 240 degrees. However, since we are dealing with angles within a circle, we need to consider the supplementary angle to 240 degrees, which is 360 degrees - 240 degrees = 120 degrees. This is the angle at the center subtended by the same arc as angle EAC.\n\nNow, since AD is the diameter, angle ABD is a right angle (90 degrees) because any angle inscribed in a semicircle is a right angle. We can now find angle ABC by subtracting angle ABD from angle AOC. Therefore, angle ABC = angle AOC - angle ABD = 120 degrees - 90 degrees = 30 degrees.\n\nThus, the degree of angle ABC is 30 degrees.\n\n\n30 degrees\n']
[36m(Runner pid=3309020)[0m [score] [0.5]
[36m(Runner pid=3309020)[0m [ground_truth] ['60°']
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m Removed obsolete checkpoint: checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_55
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/model_world_size_2_rank_0.pt.
[36m(WorkerDict pid=3319288)[0m [rank-0]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/extra_state_world_size_2_rank_0.pt.
[36m(Runner pid=3309020)[0m Step 70
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.248
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.035
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.011
[36m(Runner pid=3309020)[0m ppo_kl: 2.291050938953987e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.02
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.02
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.68
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.68
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1027064
[36m(Runner pid=3309020)[0m balanced_min: 1025371
[36m(Runner pid=3309020)[0m max: 1050864
[36m(Runner pid=3309020)[0m mean: 1026217.5
[36m(Runner pid=3309020)[0m min: 1001571
[36m(Runner pid=3309020)[0m minmax_diff: 49293
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 112.276
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 77.559
[36m(Runner pid=3309020)[0m mfu_actor: 0.126
[36m(Runner pid=3309020)[0m throughput: 945.797
[36m(Runner pid=3309020)[0m time_per_step: 1085.03
[36m(Runner pid=3309020)[0m total_num_tokens: 2052435
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 668.0
[36m(Runner pid=3309020)[0m mean: 466.031
[36m(Runner pid=3309020)[0m min: 412.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 4592.0
[36m(Runner pid=3309020)[0m mean: 335.701
[36m(Runner pid=3309020)[0m min: 71.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.362
[36m(Runner pid=3309020)[0m format: 0.997
[36m(Runner pid=3309020)[0m overall: 0.68
[36m(Runner pid=3309020)[0m tag_reward: 0.998
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 8.21500661069482e-05
[36m(Runner pid=3309020)[0m gen: 0.155
[36m(Runner pid=3309020)[0m old: 0.043
[36m(Runner pid=3309020)[0m ref: 0.043
[36m(Runner pid=3309020)[0m reward: 0.008
[36m(Runner pid=3309020)[0m update_actor: 0.275
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.169
[36m(Runner pid=3309020)[0m gen: 133.193
[36m(Runner pid=3309020)[0m old: 88.19
[36m(Runner pid=3309020)[0m ref: 87.856
[36m(Runner pid=3309020)[0m reward: 6.6
[36m(Runner pid=3309020)[0m save_checkpoint: 32.34
[36m(Runner pid=3309020)[0m step: 1085.03
[36m(Runner pid=3309020)[0m update_actor: 563.432
[36m(Runner pid=3309020)[0m validation: 172.611
[36m(Runner pid=3309020)[0m val:
[36m(Runner pid=3309020)[0m accuracy_reward: 0.42
[36m(Runner pid=3309020)[0m format_reward: 0.977
[36m(Runner pid=3309020)[0m overall_reward: 0.7
[36m(Runner pid=3309020)[0m reward_score: 0.7
[36m(Runner pid=3309020)[0m tag_reward_reward: 0.983
[36m(Runner pid=3309020)[0m
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving model to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving checkpoint to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/model_world_size_2_rank_1.pt.
[36m(WorkerDict pid=3319541)[0m [rank-1]: Saving extra_state to /home/huzhe/workspace/EasyR1/checkpoints/easy_r1/qwen2_5_vl_3b_GEOQA_8K_R1V/global_step_70/actor/extra_state_world_size_2_rank_1.pt.
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 71; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.51 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.09 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:06:52 [executor_base.py:219] It took 0.342307 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 60.01 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.43 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:08:25 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:06:52 [executor_base.py:219] It took 0.340798 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:08:25 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.82 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:08:25 [executor_base.py:208] It took 0.327078 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.76 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.84 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:08:25 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:08:26 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.84 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:08:26 [executor_base.py:208] It took 0.326209 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00034602373489178717, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0011678762966766953}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00020391400903463364, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.4367882311344147, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0012463824823498726}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.08830709755420685, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00035080689121969044, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.6670770049095154, 'actor/pg_clipfrac': 0.0005858230870217085, 'actor/ppo_kl': 0.00018321385141462088}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.2686467170715332, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001109790406189859}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.4934470057487488, 'actor/pg_clipfrac': 0.0028653296176344156, 'actor/ppo_kl': 0.000553201069124043}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.6650474667549133, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.0623103566467762, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.14265121519565582, 'actor/pg_clipfrac': 0.0012730744201689959, 'actor/ppo_kl': 0.000807377859018743}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.3804302215576172, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0002742470824159682, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0002443876874167472}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.337358683347702, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.6115303039550781, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0001552828907733783}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.2593385577201843, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.03444398194551468, 'actor/pg_clipfrac': 0.000697836687322706, 'actor/ppo_kl': 0.00017684568592812866}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.33464932441711426, 'actor/pg_clipfrac': 0.0016181230312213302, 'actor/ppo_kl': -0.001284897094592452}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.07623955607414246, 'actor/pg_clipfrac': 0.003223726525902748, 'actor/ppo_kl': 0.0007241087150759995}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00028053633286617696, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000401201075874269}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.7064480781555176, 'actor/pg_clipfrac': 0.0017391304718330503, 'actor/ppo_kl': -0.0019348443020135164}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0001825490326154977, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00025826177443377674}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.19850394129753113, 'actor/pg_clipfrac': 0.0029036004561930895, 'actor/ppo_kl': -0.0009508271468803287}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.2679818272590637, 'actor/pg_clipfrac': 0.0011574074160307646, 'actor/ppo_kl': -0.0017949541797861457}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.13414466381072998, 'actor/pg_clipfrac': 0.003267973894253373, 'actor/ppo_kl': 0.0003192216099705547}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.28814223408699036, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014035184867680073}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.00028388682403601706, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009329073363915086}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.10018562525510788, 'actor/pg_clipfrac': 0.0012143290368840098, 'actor/ppo_kl': -0.0007983348332345486}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.73128342628479, 'actor/pg_clipfrac': 0.0018501387676224113, 'actor/ppo_kl': -0.0003982424095738679}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': -0.41141968965530396, 'actor/pg_clipfrac': 0.0027045300230383873, 'actor/ppo_kl': -0.0019584756810218096}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.283528208732605, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005779018974862993}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.08009418100118637, 'actor/pg_clipfrac': 0.0007751937955617905, 'actor/ppo_kl': 0.002374720061197877}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.0613238662481308, 'actor/pg_clipfrac': 0.0009737098589539528, 'actor/ppo_kl': -9.51482797972858e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.11474189907312393, 'actor/pg_clipfrac': 0.0005714285653084517, 'actor/ppo_kl': -0.0002326747344341129}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.1172504723072052, 'actor/pg_clipfrac': 0.001805054140277207, 'actor/ppo_kl': -0.0024211311247199774}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.06538793444633484, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014236894203349948}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0002932194038294256, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0013424522476270795}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.0778321772813797, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00018000946147367358}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.35450735688209534, 'actor/pg_clipfrac': 0.0011074197245761752, 'actor/ppo_kl': -0.002921156119555235}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.1387491524219513, 'actor/pg_clipfrac': 0.0015455950051546097, 'actor/ppo_kl': 0.0008163643651641905}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0506717674434185, 'actor/pg_clipfrac': 0.0012262415839359164, 'actor/ppo_kl': 0.0009573849383741617}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.14680421352386475, 'actor/pg_clipfrac': 0.0021367522422224283, 'actor/ppo_kl': 0.000551256351172924}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.3535363972187042, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00019797914137598127}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.3947141468524933, 'actor/pg_clipfrac': 0.0005146680632606149, 'actor/ppo_kl': 0.0009198515326716006}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.22346289455890656, 'actor/pg_clipfrac': 0.0016064257360994816, 'actor/ppo_kl': 0.0005195908597670496}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.25414949655532837, 'actor/pg_clipfrac': 0.0008431703317910433, 'actor/ppo_kl': 0.0006985189975239336}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.34239694476127625, 'actor/pg_clipfrac': 0.004629629664123058, 'actor/ppo_kl': 3.9529801142634824e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1186743751168251, 'actor/pg_clipfrac': 0.0020790020935237408, 'actor/ppo_kl': 0.00022423539485316724}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.2693675756454468, 'actor/pg_clipfrac': 0.00269905524328351, 'actor/ppo_kl': -0.0007972511230036616}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.23422405123710632, 'actor/pg_clipfrac': 0.002322880318388343, 'actor/ppo_kl': 0.001602602656930685}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00043068279046565294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011399155482649803}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0005301767378114164, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00022036234440747648}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': -0.07711450010538101, 'actor/pg_clipfrac': 0.002044989727437496, 'actor/ppo_kl': -0.0010174417402595282}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.1204431876540184, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001350022736005485}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.00021677739277947694, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005042865523137152}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.35475143790245056, 'actor/pg_clipfrac': 0.0019361084559932351, 'actor/ppo_kl': 0.0005633620894514024}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.0002765152894426137, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00019420214812271297}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.6104624271392822, 'actor/pg_clipfrac': 0.000860585190821439, 'actor/ppo_kl': 0.00047538120998069644}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.07183317095041275, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010358162689954042}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': -0.20331987738609314, 'actor/pg_clipfrac': 0.0011668611550703645, 'actor/ppo_kl': 0.001247486099600792}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': -0.6001918911933899, 'actor/pg_clipfrac': 0.002288329415023327, 'actor/ppo_kl': 0.0005386034026741982}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.49774473905563354, 'actor/pg_clipfrac': 0.0007757951971143484, 'actor/ppo_kl': 0.0002288803516421467}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00021405362349469215, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0016825762577354908}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.11489468812942505, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.00041089882142841816}
[36m(Runner pid=3309020)[0m Step 71
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.247
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.031
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.014
[36m(Runner pid=3309020)[0m ppo_kl: 7.389998193687574e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.017
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.676
[36m(Runner pid=3309020)[0m min: 0.5
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1003643
[36m(Runner pid=3309020)[0m balanced_min: 1003643
[36m(Runner pid=3309020)[0m max: 1005608
[36m(Runner pid=3309020)[0m mean: 1003643.0
[36m(Runner pid=3309020)[0m min: 1001678
[36m(Runner pid=3309020)[0m minmax_diff: 3930
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 105.934
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 77.559
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 1168.422
[36m(Runner pid=3309020)[0m time_per_step: 858.973
[36m(Runner pid=3309020)[0m total_num_tokens: 2007286
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 695.0
[36m(Runner pid=3309020)[0m mean: 465.076
[36m(Runner pid=3309020)[0m min: 409.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1262.0
[36m(Runner pid=3309020)[0m mean: 319.02
[36m(Runner pid=3309020)[0m min: 47.0
[36m(Runner pid=3309020)[0m reward:
[36m(Runner pid=3309020)[0m accuracy: 0.353
[36m(Runner pid=3309020)[0m format: 1.0
[36m(Runner pid=3309020)[0m overall: 0.676
[36m(Runner pid=3309020)[0m tag_reward: 1.0
[36m(Runner pid=3309020)[0m timing_per_token_ms:
[36m(Runner pid=3309020)[0m adv: 0.0
[36m(Runner pid=3309020)[0m gen: 0.132
[36m(Runner pid=3309020)[0m old: 0.044
[36m(Runner pid=3309020)[0m ref: 0.044
[36m(Runner pid=3309020)[0m reward: 0.007
[36m(Runner pid=3309020)[0m update_actor: 0.282
[36m(Runner pid=3309020)[0m timing_s:
[36m(Runner pid=3309020)[0m adv: 0.328
[36m(Runner pid=3309020)[0m gen: 108.133
[36m(Runner pid=3309020)[0m old: 88.439
[36m(Runner pid=3309020)[0m ref: 89.323
[36m(Runner pid=3309020)[0m reward: 6.078
[36m(Runner pid=3309020)[0m step: 858.973
[36m(Runner pid=3309020)[0m update_actor: 566.048
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m Step 72; batch size: 512
[36m(Runner pid=3309020)[0m
[36m(Runner pid=3309020)[0m ------------------
[36m(Runner pid=3309020)[0m [gen batch size]: 512
[36m(WorkerDict pid=3319288)[0m Before state_dict() in sharding manager: 8.63 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After state_dict() in sharding manager: 19.06 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:25<1:48:14, 5.09s/it, est. speed input: 102.87 toks/s, output: 22.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:30<56:29, 2.67s/it, est. speed input: 162.74 toks/s, output: 42.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:30<31:19, 1.49s/it, est. speed input: 234.59 toks/s, output: 64.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 20/1280 [00:31<20:32, 1.02it/s, est. speed input: 300.62 toks/s, output: 86.36 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:32<08:13, 2.52it/s, est. speed input: 505.37 toks/s, output: 152.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:32<06:43, 3.07it/s, est. speed input: 565.10 toks/s, output: 172.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▎ | 45/1280 [00:33<05:58, 3.45it/s, est. speed input: 615.58 toks/s, output: 190.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:36<07:14, 2.83it/s, est. speed input: 633.09 toks/s, output: 198.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:36<05:46, 3.53it/s, est. speed input: 685.68 toks/s, output: 218.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:37<03:30, 5.78it/s, est. speed input: 797.71 toks/s, output: 265.40 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:37<02:53, 6.99it/s, est. speed input: 855.48 toks/s, output: 286.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<01:49, 10.94it/s, est. speed input: 995.61 toks/s, output: 334.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:37<01:19, 15.06it/s, est. speed input: 1113.18 toks/s, output: 375.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:38<00:57, 20.45it/s, est. speed input: 1228.43 toks/s, output: 425.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 105/1280 [00:38<00:53, 22.08it/s, est. speed input: 1283.93 toks/s, output: 446.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▊ | 110/1280 [00:38<00:49, 23.74it/s, est. speed input: 1340.89 toks/s, output: 466.60 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:38<00:36, 31.48it/s, est. speed input: 1458.70 toks/s, output: 512.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<00:58, 19.66it/s, est. speed input: 1489.96 toks/s, output: 526.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|█ | 130/1280 [00:39<01:04, 17.82it/s, est. speed input: 1535.13 toks/s, output: 546.32 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:40<01:01, 18.63it/s, est. speed input: 1628.77 toks/s, output: 585.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:40<01:29, 12.70it/s, est. speed input: 1652.35 toks/s, output: 602.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:41<00:59, 18.72it/s, est. speed input: 1805.16 toks/s, output: 673.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:41<00:45, 24.58it/s, est. speed input: 1910.88 toks/s, output: 719.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:41<00:35, 31.08it/s, est. speed input: 2017.56 toks/s, output: 769.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:41<00:28, 37.95it/s, est. speed input: 2117.63 toks/s, output: 815.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 200/1280 [00:42<00:36, 29.66it/s, est. speed input: 2203.21 toks/s, output: 854.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:42<00:24, 42.44it/s, est. speed input: 2404.54 toks/s, output: 951.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:42<00:27, 37.64it/s, est. speed input: 2492.02 toks/s, output: 995.50 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 235/1280 [00:42<00:27, 38.35it/s, est. speed input: 2540.30 toks/s, output: 1019.29 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 245/1280 [00:43<00:27, 37.05it/s, est. speed input: 2632.20 toks/s, output: 1064.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:43<00:27, 38.05it/s, est. speed input: 2681.03 toks/s, output: 1094.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 255/1280 [00:43<00:38, 26.78it/s, est. speed input: 2711.02 toks/s, output: 1111.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:44<00:50, 20.22it/s, est. speed input: 2734.96 toks/s, output: 1132.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 265/1280 [00:44<00:46, 21.99it/s, est. speed input: 2777.81 toks/s, output: 1161.27 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██▏ | 275/1280 [00:44<00:33, 29.97it/s, est. speed input: 2874.40 toks/s, output: 1212.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 23%|██▎ | 290/1280 [00:44<00:22, 44.16it/s, est. speed input: 3015.31 toks/s, output: 1289.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:44<00:15, 61.10it/s, est. speed input: 3160.02 toks/s, output: 1363.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:44<00:14, 68.11it/s, est. speed input: 3256.21 toks/s, output: 1404.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:45<00:17, 56.12it/s, est. speed input: 3342.09 toks/s, output: 1454.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:45<00:14, 66.83it/s, est. speed input: 3488.37 toks/s, output: 1520.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:45<00:12, 73.00it/s, est. speed input: 3583.35 toks/s, output: 1574.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:45<00:14, 64.72it/s, est. speed input: 3672.19 toks/s, output: 1623.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:45<00:12, 71.66it/s, est. speed input: 3767.81 toks/s, output: 1679.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:46<00:24, 37.35it/s, est. speed input: 3827.42 toks/s, output: 1723.71 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:46<00:21, 40.76it/s, est. speed input: 3909.62 toks/s, output: 1767.76 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:46<00:19, 44.38it/s, est. speed input: 3993.40 toks/s, output: 1804.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 415/1280 [00:46<00:17, 48.55it/s, est. speed input: 4110.18 toks/s, output: 1863.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 425/1280 [00:46<00:15, 53.98it/s, est. speed input: 4195.33 toks/s, output: 1917.87 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 435/1280 [00:47<00:16, 51.67it/s, est. speed input: 4271.78 toks/s, output: 1968.19 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▍ | 445/1280 [00:47<00:14, 57.38it/s, est. speed input: 4361.66 toks/s, output: 2025.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 455/1280 [00:47<00:15, 54.26it/s, est. speed input: 4440.22 toks/s, output: 2076.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▋ | 465/1280 [00:47<00:13, 59.65it/s, est. speed input: 4521.43 toks/s, output: 2141.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 480/1280 [00:47<00:14, 56.91it/s, est. speed input: 4637.12 toks/s, output: 2231.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:48<00:09, 78.76it/s, est. speed input: 4808.88 toks/s, output: 2345.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|███▉ | 510/1280 [00:48<00:09, 79.84it/s, est. speed input: 4893.15 toks/s, output: 2392.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 520/1280 [00:48<00:09, 80.72it/s, est. speed input: 4979.27 toks/s, output: 2443.48 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████▏ | 530/1280 [00:48<00:14, 52.00it/s, est. speed input: 5034.82 toks/s, output: 2490.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 550/1280 [00:48<00:10, 67.94it/s, est. speed input: 5212.07 toks/s, output: 2588.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 44%|████▍ | 565/1280 [00:48<00:09, 77.16it/s, est. speed input: 5347.11 toks/s, output: 2680.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▌ | 580/1280 [00:49<00:08, 85.53it/s, est. speed input: 5478.08 toks/s, output: 2779.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▌ | 590/1280 [00:49<00:08, 83.02it/s, est. speed input: 5557.35 toks/s, output: 2818.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 47%|████▋ | 605/1280 [00:49<00:12, 55.79it/s, est. speed input: 5645.01 toks/s, output: 2887.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 615/1280 [00:49<00:11, 60.17it/s, est. speed input: 5722.52 toks/s, output: 2942.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:50<00:12, 53.40it/s, est. speed input: 5783.21 toks/s, output: 2980.47 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|█████ | 640/1280 [00:50<00:10, 63.46it/s, est. speed input: 5898.79 toks/s, output: 3071.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 660/1280 [00:50<00:07, 83.98it/s, est. speed input: 6065.64 toks/s, output: 3196.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 675/1280 [00:50<00:07, 75.89it/s, est. speed input: 6174.48 toks/s, output: 3258.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▎ | 685/1280 [00:50<00:07, 77.53it/s, est. speed input: 6250.07 toks/s, output: 3317.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:50<00:08, 68.05it/s, est. speed input: 6316.91 toks/s, output: 3366.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:51<00:07, 79.10it/s, est. speed input: 6467.72 toks/s, output: 3484.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 57%|█████▋ | 730/1280 [00:51<00:05, 92.24it/s, est. speed input: 6595.47 toks/s, output: 3587.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▊ | 750/1280 [00:51<00:04, 109.84it/s, est. speed input: 6750.70 toks/s, output: 3685.86 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 60%|█████▉ | 765/1280 [00:51<00:06, 83.97it/s, est. speed input: 6850.00 toks/s, output: 3743.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 780/1280 [00:51<00:06, 76.93it/s, est. speed input: 6952.71 toks/s, output: 3839.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 810/1280 [00:51<00:04, 113.24it/s, est. speed input: 7206.64 toks/s, output: 4003.37 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:52<00:03, 118.27it/s, est. speed input: 7332.05 toks/s, output: 4098.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 67%|██████▋ | 860/1280 [00:52<00:02, 140.54it/s, est. speed input: 7621.35 toks/s, output: 4332.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 69%|██████▉ | 880/1280 [00:52<00:03, 111.40it/s, est. speed input: 7756.76 toks/s, output: 4441.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 895/1280 [00:52<00:03, 97.80it/s, est. speed input: 7859.53 toks/s, output: 4515.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:53<00:04, 83.91it/s, est. speed input: 7952.28 toks/s, output: 4595.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 72%|███████▏ | 920/1280 [00:53<00:05, 65.54it/s, est. speed input: 7997.61 toks/s, output: 4645.78 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 935/1280 [00:53<00:04, 72.02it/s, est. speed input: 8106.70 toks/s, output: 4752.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 74%|███████▍ | 950/1280 [00:53<00:04, 77.70it/s, est. speed input: 8208.99 toks/s, output: 4841.46 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 980/1280 [00:53<00:02, 108.57it/s, est. speed input: 8448.81 toks/s, output: 5062.80 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 78%|███████▊ | 1000/1280 [00:53<00:02, 114.63it/s, est. speed input: 8606.24 toks/s, output: 5191.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 79%|███████▉ | 1015/1280 [00:54<00:02, 109.72it/s, est. speed input: 8710.45 toks/s, output: 5279.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 81%|████████ | 1035/1280 [00:54<00:02, 100.83it/s, est. speed input: 8836.48 toks/s, output: 5417.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1050/1280 [00:54<00:02, 98.39it/s, est. speed input: 8937.36 toks/s, output: 5520.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 83%|████████▎ | 1065/1280 [00:54<00:02, 93.36it/s, est. speed input: 9033.60 toks/s, output: 5636.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:54<00:01, 106.37it/s, est. speed input: 9205.94 toks/s, output: 5829.64 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:55<00:01, 102.26it/s, est. speed input: 9306.22 toks/s, output: 5943.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:55<00:01, 82.68it/s, est. speed input: 9384.06 toks/s, output: 6045.13 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:55<00:01, 91.80it/s, est. speed input: 9491.12 toks/s, output: 6157.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:55<00:01, 96.60it/s, est. speed input: 9593.53 toks/s, output: 6250.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1165/1280 [00:55<00:01, 92.49it/s, est. speed input: 9685.30 toks/s, output: 6364.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:55<00:00, 98.82it/s, est. speed input: 9821.35 toks/s, output: 6509.43 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:56<00:00, 80.92it/s, est. speed input: 9895.08 toks/s, output: 6609.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1215/1280 [00:56<00:00, 82.31it/s, est. speed input: 9986.47 toks/s, output: 6724.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1225/1280 [00:56<00:01, 45.23it/s, est. speed input: 9961.60 toks/s, output: 6754.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▋| 1235/1280 [00:57<00:01, 32.13it/s, est. speed input: 9938.04 toks/s, output: 6769.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1245/1280 [00:57<00:00, 35.85it/s, est. speed input: 9990.11 toks/s, output: 6860.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1255/1280 [00:58<00:01, 21.28it/s, est. speed input: 9897.71 toks/s, output: 6833.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:58<00:00, 21.47it/s, est. speed input: 9897.65 toks/s, output: 6855.38 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1265/1280 [00:59<00:00, 23.34it/s, est. speed input: 9913.10 toks/s, output: 6889.39 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [00:59<00:00, 18.55it/s, est. speed input: 9872.08 toks/s, output: 6864.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|█████████▉| 1275/1280 [01:00<00:00, 10.87it/s, est. speed input: 9732.58 toks/s, output: 6801.99 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 5.56it/s, est. speed input: 9423.63 toks/s, output: 6619.02 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 20.34it/s, est. speed input: 9423.63 toks/s, output: 6619.02 toks/s]
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:21:13 [executor_base.py:219] It took 0.339589 seconds to wake up.
[36m(WorkerDict pid=3319288)[0m After sync model weights in sharding manager: 59.98 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m After del state_dict and empty_cache in sharding manager: 49.55 GB / 79.14 GB.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:22:40 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:21:13 [executor_base.py:219] It took 0.339977 seconds to wake up.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:22:41 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.83 GiB memory is still in use.
[36m(WorkerDict pid=3319541)[0m INFO 04-08 15:22:41 [executor_base.py:208] It took 0.327910 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m Before vllm offload in sharding manager: 47.73 GB / 79.14 GB.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:22:46 [block_pool.py:255] Successfully reset prefix cache
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:22:46 [gpu_worker.py:81] Sleep mode freed 40.92 GiB memory, 6.81 GiB memory is still in use.
[36m(WorkerDict pid=3319288)[0m INFO 04-08 15:22:46 [executor_base.py:208] It took 0.325471 seconds to fall asleep.
[36m(WorkerDict pid=3319288)[0m After vllm offload in sharding manager: 6.81 GB / 79.14 GB.
[36m(Runner pid=3309020)[0m gen_batch_output: 2560
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(Runner pid=3309020)[0m Failed to parse gold solution: $$
[36m(WorkerDict pid=3319288)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319541)[0m Number of mini_batches: 4, global_batch_size_per_device: 320
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00035076134372502565, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.999450074043125e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.014684412628412247, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.00021744232799392194, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0017206856282427907}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': -0.0972491130232811, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000383650854928419}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.0003930935636162758, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0006217749323695898}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': -0.0417451485991478, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.0002992440713569522, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0011375726899132133}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.26179006695747375, 'actor/pg_clipfrac': 0.0005970149068161845, 'actor/ppo_kl': -0.0006772362976334989}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.020612023770809174, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0006817998364567757}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.16776208579540253, 'actor/pg_clipfrac': 0.0009354536887258291, 'actor/ppo_kl': -4.3439016735646874e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': -0.22055141627788544, 'actor/pg_clipfrac': 0.0015306122368201613, 'actor/ppo_kl': -0.0006754914065822959}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.00016364656039513648, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00046025498886592686}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0004508399579208344, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.000391782057704404, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009460638393647969}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': -0.1583714783191681, 'actor/pg_clipfrac': 0.001145475427620113, 'actor/ppo_kl': 0.0008644123445264995}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.41611626744270325, 'actor/pg_clipfrac': 0.0023094688076525927, 'actor/ppo_kl': -7.479185296688229e-05}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.33787739276885986, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0007402177434414625}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.25446832180023193, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00163546041585505}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.19300556182861328, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0009170089033432305}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.00033582834294065833, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.000676470750477165}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.00039710840792395175, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0010437372839078307}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.09976562112569809, 'actor/pg_clipfrac': 0.002106741536408663, 'actor/ppo_kl': 9.401460374647286e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.4279480576515198, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0012479815632104874}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.00045318988850340247, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005830295849591494}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': 0.3450310528278351, 'actor/pg_clipfrac': 0.0019342360319569707, 'actor/ppo_kl': 0.0027447540778666735}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.1493065506219864, 'actor/pg_clipfrac': 0.0005633803084492683, 'actor/ppo_kl': 0.00030435266671702266}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.9400297999382019, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.001983333146199584}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.04500539228320122, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0037102734204381704}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.0006107747321948409, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0014368277043104172}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003764723951462656, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005072045605629683}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.00036047480534762144, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 6.06654975854326e-06}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.17769403755664825, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0013977106427773833}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': -0.33361566066741943, 'actor/pg_clipfrac': 0.002649883972480893, 'actor/ppo_kl': 0.0008114925003610551}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': 0.00023652140225749463, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0002687668602447957}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': 0.2238287478685379, 'actor/pg_clipfrac': 0.0006973500712774694, 'actor/ppo_kl': 0.00019313301891088486}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.0004111572343390435, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0004264439339749515}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.22493796050548553, 'actor/pg_clipfrac': 0.004823151044547558, 'actor/ppo_kl': 0.001829122775234282}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.13211452960968018, 'actor/pg_clipfrac': 0.002057613106444478, 'actor/ppo_kl': -1.709941898297984e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': -0.13001945614814758, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.001006138976663351}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': 0.0002953883958980441, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00023261659953277558}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.06581853330135345, 'actor/pg_clipfrac': 0.0010204081190750003, 'actor/ppo_kl': 0.001228334498591721}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': 0.10986123234033585, 'actor/pg_clipfrac': 0.0017667844658717513, 'actor/ppo_kl': 0.00010702719737309963}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.14875908195972443, 'actor/pg_clipfrac': 0.0008084074361249804, 'actor/ppo_kl': -0.0002534198574721813}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 7.527459092671052e-05, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005459638778120279}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.00043709788587875664, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 5.9575511841103435e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.4122619926929474, 'actor/pg_clipfrac': 0.001215066877193749, 'actor/ppo_kl': 0.0011202833848074079}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.0005143781891092658, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0018397415988147259}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': -0.1738835722208023, 'actor/pg_clipfrac': 0.0034602077212184668, 'actor/ppo_kl': -0.00018355096108280122}
[36m(WorkerDict pid=3319288)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 0, batch_metrics: {'actor/pg_loss': 0.00027517214766703546, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.000487656012410298}
[36m(WorkerDict pid=3319288)[0m - Current Step 5, batch_metrics: {'actor/pg_loss': -0.1848837286233902, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.002847281750291586}
[36m(WorkerDict pid=3319541)[0m Update policy with 80 micro batches
[36m(WorkerDict pid=3319288)[0m - Current Step 10, batch_metrics: {'actor/pg_loss': -0.2908196449279785, 'actor/pg_clipfrac': 0.0024009603075683117, 'actor/ppo_kl': -0.00032439353526569903}
[36m(WorkerDict pid=3319288)[0m - Current Step 15, batch_metrics: {'actor/pg_loss': 0.000419548770878464, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -8.216621063183993e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 20, batch_metrics: {'actor/pg_loss': 0.000338273704983294, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0024472635705024004}
[36m(WorkerDict pid=3319288)[0m - Current Step 25, batch_metrics: {'actor/pg_loss': 0.3601624667644501, 'actor/pg_clipfrac': 0.0009891196386888623, 'actor/ppo_kl': 0.0005496654193848372}
[36m(WorkerDict pid=3319288)[0m - Current Step 30, batch_metrics: {'actor/pg_loss': 0.13887055218219757, 'actor/pg_clipfrac': 0.0006397952674888074, 'actor/ppo_kl': 4.721877849078737e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 35, batch_metrics: {'actor/pg_loss': -0.24454663693904877, 'actor/pg_clipfrac': 0.0022522523067891598, 'actor/ppo_kl': -7.45670186006464e-05}
[36m(WorkerDict pid=3319288)[0m - Current Step 40, batch_metrics: {'actor/pg_loss': -0.005134013947099447, 'actor/pg_clipfrac': 0.0011286681983619928, 'actor/ppo_kl': -0.0015798562671989202}
[36m(WorkerDict pid=3319288)[0m - Current Step 45, batch_metrics: {'actor/pg_loss': -0.4091501533985138, 'actor/pg_clipfrac': 0.006167401093989611, 'actor/ppo_kl': -0.0006516544963233173}
[36m(WorkerDict pid=3319288)[0m - Current Step 50, batch_metrics: {'actor/pg_loss': 0.30455082654953003, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005862017278559506}
[36m(WorkerDict pid=3319288)[0m - Current Step 55, batch_metrics: {'actor/pg_loss': 0.1374669224023819, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0005012932233512402}
[36m(WorkerDict pid=3319288)[0m - Current Step 60, batch_metrics: {'actor/pg_loss': 0.18984036147594452, 'actor/pg_clipfrac': 0.002515723230317235, 'actor/ppo_kl': -0.0006360425613820553}
[36m(WorkerDict pid=3319288)[0m - Current Step 65, batch_metrics: {'actor/pg_loss': 0.0003260451485402882, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.00045019169920124114}
[36m(WorkerDict pid=3319288)[0m - Current Step 70, batch_metrics: {'actor/pg_loss': 0.31773146986961365, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': -0.0003264836850576103}
[36m(WorkerDict pid=3319288)[0m - Current Step 75, batch_metrics: {'actor/pg_loss': 0.15951590240001678, 'actor/pg_clipfrac': 0.0, 'actor/ppo_kl': 0.0005060756229795516}
[36m(Runner pid=3309020)[0m Step 72
[36m(Runner pid=3309020)[0m actor:
[36m(Runner pid=3309020)[0m grad_norm: 0.345
[36m(Runner pid=3309020)[0m kl_coef: 0.01
[36m(Runner pid=3309020)[0m kl_loss: 0.039
[36m(Runner pid=3309020)[0m lr: 1.0e-06
[36m(Runner pid=3309020)[0m pg_clipfrac: 0.001
[36m(Runner pid=3309020)[0m pg_loss: 0.008
[36m(Runner pid=3309020)[0m ppo_kl: 5.2654363444337716e-05
[36m(Runner pid=3309020)[0m critic:
[36m(Runner pid=3309020)[0m advantages:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m returns:
[36m(Runner pid=3309020)[0m max: 1.789
[36m(Runner pid=3309020)[0m mean: -0.015
[36m(Runner pid=3309020)[0m min: -1.789
[36m(Runner pid=3309020)[0m rewards:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.689
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m score:
[36m(Runner pid=3309020)[0m max: 1.0
[36m(Runner pid=3309020)[0m mean: 0.689
[36m(Runner pid=3309020)[0m min: 0.05
[36m(Runner pid=3309020)[0m global_seqlen:
[36m(Runner pid=3309020)[0m balanced_max: 1000212
[36m(Runner pid=3309020)[0m balanced_min: 1000211
[36m(Runner pid=3309020)[0m max: 1009336
[36m(Runner pid=3309020)[0m mean: 1000211.5
[36m(Runner pid=3309020)[0m min: 991087
[36m(Runner pid=3309020)[0m minmax_diff: 18249
[36m(Runner pid=3309020)[0m perf:
[36m(Runner pid=3309020)[0m cpu_memory_used_gb: 110.904
[36m(Runner pid=3309020)[0m max_memory_allocated_gb: 40.226
[36m(Runner pid=3309020)[0m max_memory_reserved_gb: 77.559
[36m(Runner pid=3309020)[0m mfu_actor: 0.123
[36m(Runner pid=3309020)[0m throughput: 1168.171
[36m(Runner pid=3309020)[0m time_per_step: 856.22
[36m(Runner pid=3309020)[0m total_num_tokens: 2000423
[36m(Runner pid=3309020)[0m prompt_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 679.0
[36m(Runner pid=3309020)[0m mean: 464.568
[36m(Runner pid=3309020)[0m min: 410.0
[36m(Runner pid=3309020)[0m response_length:
[36m(Runner pid=3309020)[0m clip_ratio: 0.0
[36m(Runner pid=3309020)[0m max: 1586.0
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 0/1280 [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 0%| | 5/1280 [00:21<1:32:59, 4.38s/it, est. speed input: 102.60 toks/s, output: 24.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 10/1280 [00:26<49:40, 2.35s/it, est. speed input: 167.85 toks/s, output: 42.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 1%| | 15/1280 [00:26<27:16, 1.29s/it, est. speed input: 251.39 toks/s, output: 69.41 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 25/1280 [00:28<13:32, 1.54it/s, est. speed input: 402.55 toks/s, output: 119.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 2%|▏ | 30/1280 [00:31<13:20, 1.56it/s, est. speed input: 437.41 toks/s, output: 130.30 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 35/1280 [00:31<09:41, 2.14it/s, est. speed input: 509.36 toks/s, output: 154.52 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 3%|▎ | 40/1280 [00:31<07:17, 2.84it/s, est. speed input: 576.59 toks/s, output: 178.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 50/1280 [00:32<04:30, 4.55it/s, est. speed input: 707.15 toks/s, output: 223.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 4%|▍ | 55/1280 [00:34<05:24, 3.77it/s, est. speed input: 737.21 toks/s, output: 233.90 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 65/1280 [00:36<04:32, 4.46it/s, est. speed input: 830.61 toks/s, output: 268.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 5%|▌ | 70/1280 [00:36<03:54, 5.16it/s, est. speed input: 884.95 toks/s, output: 286.23 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 6%|▋ | 80/1280 [00:37<02:57, 6.75it/s, est. speed input: 991.62 toks/s, output: 320.28 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 7%|▋ | 90/1280 [00:38<02:19, 8.56it/s, est. speed input: 1101.08 toks/s, output: 368.73 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 8%|▊ | 100/1280 [00:38<01:46, 11.08it/s, est. speed input: 1210.01 toks/s, output: 415.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 115/1280 [00:39<01:18, 14.91it/s, est. speed input: 1371.33 toks/s, output: 482.31 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 9%|▉ | 120/1280 [00:39<01:10, 16.43it/s, est. speed input: 1421.00 toks/s, output: 505.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 10%|▉ | 125/1280 [00:39<01:13, 15.80it/s, est. speed input: 1469.26 toks/s, output: 520.42 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 135/1280 [00:39<01:01, 18.71it/s, est. speed input: 1568.21 toks/s, output: 567.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█ | 140/1280 [00:40<00:58, 19.53it/s, est. speed input: 1616.05 toks/s, output: 587.04 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 11%|█▏ | 145/1280 [00:40<00:52, 21.73it/s, est. speed input: 1669.92 toks/s, output: 611.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 150/1280 [00:40<01:09, 16.19it/s, est. speed input: 1701.13 toks/s, output: 629.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▏ | 155/1280 [00:40<00:59, 18.89it/s, est. speed input: 1751.75 toks/s, output: 651.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 12%|█▎ | 160/1280 [00:41<01:03, 17.61it/s, est. speed input: 1793.44 toks/s, output: 670.25 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 165/1280 [00:41<01:14, 14.94it/s, est. speed input: 1825.39 toks/s, output: 687.96 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 13%|█▎ | 170/1280 [00:41<01:05, 16.91it/s, est. speed input: 1871.53 toks/s, output: 713.94 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 180/1280 [00:42<00:41, 26.22it/s, est. speed input: 1970.58 toks/s, output: 760.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 14%|█▍ | 185/1280 [00:42<00:38, 28.74it/s, est. speed input: 2019.19 toks/s, output: 784.34 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▍ | 190/1280 [00:42<00:48, 22.44it/s, est. speed input: 2056.80 toks/s, output: 801.97 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 15%|█▌ | 195/1280 [00:43<01:09, 15.51it/s, est. speed input: 2081.25 toks/s, output: 814.11 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▌ | 205/1280 [00:43<00:47, 22.61it/s, est. speed input: 2178.30 toks/s, output: 857.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 16%|█▋ | 210/1280 [00:43<00:42, 25.41it/s, est. speed input: 2220.50 toks/s, output: 882.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 17%|█▋ | 220/1280 [00:43<00:29, 35.46it/s, est. speed input: 2319.34 toks/s, output: 937.45 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 18%|█▊ | 230/1280 [00:43<00:25, 41.16it/s, est. speed input: 2420.63 toks/s, output: 996.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 19%|█▉ | 240/1280 [00:43<00:24, 41.83it/s, est. speed input: 2509.00 toks/s, output: 1039.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|█▉ | 250/1280 [00:44<00:26, 39.42it/s, est. speed input: 2599.95 toks/s, output: 1094.77 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 20%|██ | 260/1280 [00:44<00:21, 48.07it/s, est. speed input: 2713.40 toks/s, output: 1145.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 21%|██ | 270/1280 [00:44<00:19, 51.37it/s, est. speed input: 2803.73 toks/s, output: 1186.57 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 280/1280 [00:45<00:29, 33.84it/s, est. speed input: 2868.69 toks/s, output: 1201.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 22%|██▏ | 285/1280 [00:45<00:33, 29.93it/s, est. speed input: 2899.50 toks/s, output: 1217.82 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 24%|██▍ | 305/1280 [00:45<00:21, 46.37it/s, est. speed input: 3092.68 toks/s, output: 1311.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▍ | 315/1280 [00:45<00:19, 50.40it/s, est. speed input: 3188.06 toks/s, output: 1367.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 25%|██▌ | 325/1280 [00:46<00:24, 39.60it/s, est. speed input: 3258.46 toks/s, output: 1417.51 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 340/1280 [00:46<00:18, 51.16it/s, est. speed input: 3396.21 toks/s, output: 1503.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 27%|██▋ | 350/1280 [00:46<00:16, 54.84it/s, est. speed input: 3483.65 toks/s, output: 1559.63 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 28%|██▊ | 360/1280 [00:46<00:15, 58.11it/s, est. speed input: 3572.22 toks/s, output: 1615.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 29%|██▉ | 370/1280 [00:46<00:17, 53.38it/s, est. speed input: 3653.49 toks/s, output: 1652.17 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|██▉ | 380/1280 [00:46<00:18, 47.95it/s, est. speed input: 3758.01 toks/s, output: 1694.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 30%|███ | 390/1280 [00:47<00:16, 53.80it/s, est. speed input: 3849.65 toks/s, output: 1743.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 31%|███▏ | 400/1280 [00:47<00:19, 45.73it/s, est. speed input: 3923.56 toks/s, output: 1775.55 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 32%|███▏ | 410/1280 [00:47<00:16, 52.04it/s, est. speed input: 4012.36 toks/s, output: 1832.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 33%|███▎ | 420/1280 [00:47<00:20, 42.76it/s, est. speed input: 4083.19 toks/s, output: 1887.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▎ | 430/1280 [00:47<00:17, 49.90it/s, est. speed input: 4167.18 toks/s, output: 1928.08 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 34%|███▍ | 440/1280 [00:48<00:17, 47.18it/s, est. speed input: 4246.27 toks/s, output: 1975.54 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 35%|███▌ | 450/1280 [00:48<00:17, 48.23it/s, est. speed input: 4325.50 toks/s, output: 2024.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 36%|███▌ | 460/1280 [00:48<00:16, 48.95it/s, est. speed input: 4401.67 toks/s, output: 2065.61 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 37%|███▋ | 475/1280 [00:48<00:12, 64.73it/s, est. speed input: 4532.36 toks/s, output: 2152.84 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 38%|███▊ | 485/1280 [00:48<00:14, 55.51it/s, est. speed input: 4605.89 toks/s, output: 2198.33 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 39%|███▉ | 500/1280 [00:49<00:11, 67.61it/s, est. speed input: 4734.10 toks/s, output: 2291.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 40%|████ | 515/1280 [00:49<00:09, 77.84it/s, est. speed input: 4865.79 toks/s, output: 2369.15 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 41%|████ | 525/1280 [00:49<00:09, 76.74it/s, est. speed input: 4944.23 toks/s, output: 2423.72 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 42%|████▏ | 535/1280 [00:49<00:13, 54.86it/s, est. speed input: 5008.50 toks/s, output: 2461.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 43%|████▎ | 545/1280 [00:49<00:13, 56.49it/s, est. speed input: 5086.37 toks/s, output: 2503.88 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 45%|████▍ | 570/1280 [00:50<00:08, 86.94it/s, est. speed input: 5307.56 toks/s, output: 2650.59 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 46%|████▋ | 595/1280 [00:50<00:06, 113.31it/s, est. speed input: 5522.98 toks/s, output: 2792.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 48%|████▊ | 610/1280 [00:50<00:06, 108.90it/s, est. speed input: 5646.59 toks/s, output: 2884.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 49%|████▉ | 625/1280 [00:50<00:08, 79.24it/s, est. speed input: 5750.75 toks/s, output: 2959.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 50%|████▉ | 635/1280 [00:50<00:08, 80.40it/s, est. speed input: 5849.02 toks/s, output: 3019.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 51%|█████ | 650/1280 [00:50<00:06, 90.93it/s, est. speed input: 5967.16 toks/s, output: 3116.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 52%|█████▏ | 665/1280 [00:51<00:06, 89.90it/s, est. speed input: 6094.55 toks/s, output: 3211.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 53%|█████▎ | 680/1280 [00:51<00:09, 63.73it/s, est. speed input: 6186.16 toks/s, output: 3277.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 54%|█████▍ | 695/1280 [00:51<00:08, 72.23it/s, est. speed input: 6303.74 toks/s, output: 3353.74 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 55%|█████▌ | 705/1280 [00:51<00:10, 57.07it/s, est. speed input: 6353.29 toks/s, output: 3398.07 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 56%|█████▌ | 715/1280 [00:52<00:09, 58.64it/s, est. speed input: 6426.15 toks/s, output: 3451.02 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 58%|█████▊ | 745/1280 [00:52<00:06, 88.88it/s, est. speed input: 6669.16 toks/s, output: 3642.14 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 59%|█████▉ | 760/1280 [00:52<00:05, 98.46it/s, est. speed input: 6790.12 toks/s, output: 3723.66 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 61%|██████ | 775/1280 [00:52<00:04, 107.49it/s, est. speed input: 6911.33 toks/s, output: 3826.44 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 63%|██████▎ | 805/1280 [00:52<00:03, 149.15it/s, est. speed input: 7154.18 toks/s, output: 4017.89 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 64%|██████▍ | 825/1280 [00:52<00:03, 135.73it/s, est. speed input: 7296.08 toks/s, output: 4124.68 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 66%|██████▌ | 845/1280 [00:52<00:04, 107.76it/s, est. speed input: 7437.45 toks/s, output: 4235.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 68%|██████▊ | 865/1280 [00:53<00:04, 97.24it/s, est. speed input: 7573.02 toks/s, output: 4358.35 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 70%|██████▉ | 890/1280 [00:53<00:03, 103.99it/s, est. speed input: 7753.82 toks/s, output: 4515.21 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 71%|███████ | 910/1280 [00:53<00:03, 118.62it/s, est. speed input: 7905.84 toks/s, output: 4650.70 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 73%|███████▎ | 930/1280 [00:53<00:03, 103.53it/s, est. speed input: 8041.20 toks/s, output: 4783.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 75%|███████▍ | 955/1280 [00:53<00:02, 123.72it/s, est. speed input: 8247.96 toks/s, output: 4963.75 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 76%|███████▌ | 970/1280 [00:54<00:02, 104.22it/s, est. speed input: 8341.29 toks/s, output: 5049.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 77%|███████▋ | 990/1280 [00:54<00:02, 118.59it/s, est. speed input: 8495.14 toks/s, output: 5188.22 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 80%|████████ | 1025/1280 [00:54<00:01, 161.95it/s, est. speed input: 8765.28 toks/s, output: 5401.18 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 82%|████████▏ | 1045/1280 [00:54<00:01, 121.24it/s, est. speed input: 8888.95 toks/s, output: 5533.01 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 84%|████████▎ | 1070/1280 [00:54<00:01, 136.90it/s, est. speed input: 9080.40 toks/s, output: 5740.93 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 85%|████████▌ | 1090/1280 [00:55<00:01, 107.49it/s, est. speed input: 9205.58 toks/s, output: 5882.03 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 86%|████████▋ | 1105/1280 [00:55<00:01, 113.63it/s, est. speed input: 9311.60 toks/s, output: 6003.67 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 88%|████████▊ | 1120/1280 [00:55<00:01, 105.78it/s, est. speed input: 9410.21 toks/s, output: 6104.12 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 89%|████████▊ | 1135/1280 [00:55<00:01, 77.69it/s, est. speed input: 9482.19 toks/s, output: 6175.95 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 90%|████████▉ | 1150/1280 [00:55<00:01, 71.54it/s, est. speed input: 9566.86 toks/s, output: 6285.69 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████ | 1160/1280 [00:56<00:02, 55.07it/s, est. speed input: 9593.93 toks/s, output: 6340.79 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 91%|█████████▏| 1170/1280 [00:56<00:02, 53.50it/s, est. speed input: 9638.62 toks/s, output: 6402.06 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 93%|█████████▎| 1185/1280 [00:56<00:01, 56.06it/s, est. speed input: 9717.91 toks/s, output: 6502.09 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 94%|█████████▍| 1200/1280 [00:56<00:01, 66.94it/s, est. speed input: 9820.48 toks/s, output: 6584.62 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▍| 1210/1280 [00:57<00:01, 66.46it/s, est. speed input: 9873.41 toks/s, output: 6649.91 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 95%|█████████▌| 1220/1280 [00:57<00:01, 59.52it/s, est. speed input: 9931.57 toks/s, output: 6737.26 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 96%|█████████▌| 1230/1280 [00:57<00:01, 48.35it/s, est. speed input: 9958.61 toks/s, output: 6779.10 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 97%|█████████▋| 1240/1280 [00:57<00:00, 42.87it/s, est. speed input: 9984.88 toks/s, output: 6853.85 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1250/1280 [00:58<00:00, 35.17it/s, est. speed input: 10002.49 toks/s, output: 6882.53 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 98%|█████████▊| 1260/1280 [00:59<00:01, 17.88it/s, est. speed input: 9872.16 toks/s, output: 6848.05 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 99%|█████████▉| 1270/1280 [01:00<00:00, 17.61it/s, est. speed input: 9859.03 toks/s, output: 6883.92 toks/s]
[36m(WorkerDict pid=3319288)[0m
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 9.83it/s, est. speed input: 9593.35 toks/s, output: 6768.35 toks/s]
Processed prompts: 100%|██████████| 1280/1280 [01:02<00:00, 20.58it/s, est. speed input: 9593.35 toks/s, output: 6768.35 toks/s]