[2025-04-17 14:51:53,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-04-17 14:51:53,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-04-17 14:51:53,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-04-17 14:51:53,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) Global rank 3, Local Rank: 3 initiated Global rank 2, Local Rank: 2 initiated [2025-04-17 14:51:55,503] [INFO] [comm.py:652:init_distributed] cdb=None [2025-04-17 14:51:55,507] [INFO] [comm.py:652:init_distributed] cdb=None Global rank 1, Local Rank: 1 initiated [2025-04-17 14:51:55,518] [INFO] [comm.py:652:init_distributed] cdb=None Global rank 0, Local Rank: 0 initiated [2025-04-17 14:51:55,550] [INFO] [comm.py:652:init_distributed] cdb=None [2025-04-17 14:51:55,550] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl GPU 0 - Using device: cuda GPU 3 - Using device: cuda GPU 2 - Using device: cuda GPU 1 - Using device: cuda Wandb initialized Rank 0: Loading vision tower: google/siglip-so400m-patch14-384 using lm_loss_weight: 0.2, video_loss_weight: 1, info_loss_weight: 0.5, ref_loss_weight: 6.0, uncertainty_loss_weight: 0.3, and tv_loss_weight: 0.05 for training creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|gate_proj)$', lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|gate_proj)$', lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|gate_proj)$', lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|gate_proj)$', lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) trainable params: 1,132,472,320 || all params: 8,600,842,784 || trainable%: 13.16699244993431 [Rank 2] Distributed initialized? True [Rank 2] Backend: nccl load datasets/coin/videos_metadata.json... trainable params: 1,132,472,320 || all params: 8,600,842,784 || trainable%: 13.16699244993431 [Rank 1] Distributed initialized? True [Rank 1] Backend: nccl load datasets/coin/videos_metadata.json... trainable params: 1,132,472,320 || all params: 8,600,842,784 || trainable%: 13.16699244993431 [Rank 3] Distributed initialized? True [Rank 3] Backend: nccl load datasets/coin/videos_metadata.json... trainable params: 1,132,472,320 || all params: 8,600,842,784 || trainable%: 13.16699244993431 ('base_model.model.model.image_newline', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.embed_tokens.weight', torch.Size([152064, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.0.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.0.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.0.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.1.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.1.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.2.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.2.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.3.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.3.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.4.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.4.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.5.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.5.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.6.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.6.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.7.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.7.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.8.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.8.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.9.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.9.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.10.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.10.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.11.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.11.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.12.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.12.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.13.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.13.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.14.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.14.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.15.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.15.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.16.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.16.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.17.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.17.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.18.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.18.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.19.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.19.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.20.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.20.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.21.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.21.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.22.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.22.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.22.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.22.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.23.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.23.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.23.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.23.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.24.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.24.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.24.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.24.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.24.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.25.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.25.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.25.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.25.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.25.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.26.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.26.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.26.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.26.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.26.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.27.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.27.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.27.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.27.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.27.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.norm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', torch.Size([1152, 3, 14, 14]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', torch.Size([729, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.mm_projector.original_module.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.0.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.2.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.0.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.2.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.lm_head.original_module.weight', torch.Size([152064, 3584]), torch.bfloat16, True) ('base_model.model.lm_head.modules_to_save.default.weight', torch.Size([152064, 3584]), torch.bfloat16, True) ('base_model.model.informative_head.original_module.weight', torch.Size([2, 3584]), torch.bfloat16, True) ('base_model.model.informative_head.modules_to_save.default.weight', torch.Size([2, 3584]), torch.bfloat16, True) ('base_model.model.relevance_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.relevance_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) [Rank 0] Distributed initialized? True [Rank 0] Backend: nccl load datasets/coin/videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHey assistant, do you know the current video content? Reply me concisely.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30872, 30923), range(31662, 31695)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30830, 30881), range(31620, 31653)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat can you tell me about? Be concise.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nContinuously answer what you observed with simple text.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30854, 30905), range(31644, 31677)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... load datasets/shot2story/release_134k_videos_metadata.json... load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nUse simple text to explain what is shown in front of me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1700, 1807), range(5290, 5534), range(9017, 9359), range(13871, 14272)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHey assistant, do you know the current video content? Reply me concisely.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1717, 1824), range(5307, 5551), range(9034, 9376), range(13888, 14289)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... load datasets/hisum/videos_metadata.json... load datasets/hisum/videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Dataset HiSumDataset has 10298 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCustomized Yamaha MT-01 akrapovic HD (Without DB-killer)<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>', torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nAt what timestamp can I find information about 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)' in the video?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat parts are relevant to the concept of 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 ninja: no work to do. Time to load cpu_adam op: 2.2662734985351562 seconds Time to load cpu_adam op: 2.294476270675659 seconds ninja: no work to do. Time to load cpu_adam op: 2.2368359565734863 seconds ninja: no work to do. Time to load cpu_adam op: 2.2494730949401855 seconds [Rank 0] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06}[Rank 2] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06}[Rank 3] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06} [Rank 1] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06} {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06}[Rank 3] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06}[Rank 1] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06} [Rank 0] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06} {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06}[Rank 1] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06}[Rank 3] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06} [Rank 0] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06} {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06}[Rank 3] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06}[Rank 1] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06} [Rank 0] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06} {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06, 'epoch': 0.58} [Rank 3] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06}[Rank 2] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06}[Rank 1] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06} [Rank 0] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06} {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06}[Rank 1] Trainer log: {'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06} [Rank 3] Trainer log: {'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06} [Rank 0] Trainer log: {'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06} {'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06, 'epoch': 0.58} [Rank 3] Trainer log: {'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06} [Rank 0] Trainer log: {'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06}[Rank 1] Trainer log: {'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06} [Rank 2] Trainer log: {'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06} {'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06, 'epoch': 0.58} [Rank 1] Trainer log: {'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06}[Rank 0] Trainer log: {'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06} [Rank 2] Trainer log: {'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06} [Rank 3] Trainer log: {'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06} {'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06}[Rank 3] Trainer log: {'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06} [Rank 1] Trainer log: {'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06} [Rank 0] Trainer log: {'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06} {'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06}[Rank 3] Trainer log: {'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06} [Rank 1] Trainer log: {'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06} [Rank 0] Trainer log: {'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06} {'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06, 'epoch': 0.58} [Rank 3] Trainer log: {'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06}[Rank 1] Trainer log: {'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06}[Rank 2] Trainer log: {'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06} [Rank 0] Trainer log: {'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06} {'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06, 'epoch': 0.58} [Rank 1] Trainer log: {'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06}[Rank 3] Trainer log: {'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06}[Rank 2] Trainer log: {'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06} [Rank 0] Trainer log: {'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06} {'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06, 'epoch': 0.58} [Rank 1] Trainer log: {'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06}[Rank 2] Trainer log: {'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06}[Rank 3] Trainer log: {'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06} [Rank 0] Trainer log: {'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06} {'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06}[Rank 1] Trainer log: {'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06}[Rank 3] Trainer log: {'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06} [Rank 0] Trainer log: {'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06} {'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06}[Rank 1] Trainer log: {'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06}[Rank 3] Trainer log: {'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06} [Rank 0] Trainer log: {'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06} {'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06}[Rank 3] Trainer log: {'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06}[Rank 1] Trainer log: {'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06} [Rank 0] Trainer log: {'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06} {'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06}[Rank 3] Trainer log: {'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06} [Rank 2] Trainer log: {'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06}[Rank 1] Trainer log: {'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06} {'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06}[Rank 3] Trainer log: {'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06}[Rank 0] Trainer log: {'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06} [Rank 2] Trainer log: {'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06} {'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06} [Rank 2] Trainer log: {'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06} [Rank 3] Trainer log: {'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06} [Rank 0] Trainer log: {'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06} {'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06}[Rank 0] Trainer log: {'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06}[Rank 2] Trainer log: {'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06} [Rank 3] Trainer log: {'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06} {'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06}[Rank 1] Trainer log: {'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06} [Rank 2] Trainer log: {'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06} [Rank 0] Trainer log: {'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06} {'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06}[Rank 3] Trainer log: {'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06}[Rank 1] Trainer log: {'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06} [Rank 0] Trainer log: {'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06} {'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06}[Rank 2] Trainer log: {'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06}[Rank 3] Trainer log: {'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06} [Rank 0] Trainer log: {'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06} {'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06}[Rank 2] Trainer log: {'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06}[Rank 0] Trainer log: {'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06} [Rank 1] Trainer log: {'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06} {'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06} [Rank 2] Trainer log: {'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06} [Rank 3] Trainer log: {'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06} [Rank 0] Trainer log: {'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06} {'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06}[Rank 3] Trainer log: {'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06}[Rank 1] Trainer log: {'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06} [Rank 0] Trainer log: {'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06} {'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 1.0383, 'grad_norm': 8.6812744140625, 'learning_rate': 7.931778968909596e-06}[Rank 3] Trainer log: {'loss': 1.0383, 'grad_norm': 8.6812744140625, 'learning_rate': 7.931778968909596e-06}[Rank 2] Trainer log: {'loss': 1.0383, 'grad_norm': 8.6812744140625, 'learning_rate': 7.931778968909596e-06} [Rank 1] Trainer log: {'loss': 1.0383, 'grad_norm': 8.6812744140625, 'learning_rate': 7.931778968909596e-06} {'loss': 1.0383, 'grad_norm': 8.6812744140625, 'learning_rate': 7.931778968909596e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.8599, 'grad_norm': 5.965872287750244, 'learning_rate': 7.925107718339058e-06}[Rank 1] Trainer log: {'loss': 0.8599, 'grad_norm': 5.965872287750244, 'learning_rate': 7.925107718339058e-06} [Rank 0] Trainer log: {'loss': 0.8599, 'grad_norm': 5.965872287750244, 'learning_rate': 7.925107718339058e-06}[Rank 3] Trainer log: {'loss': 0.8599, 'grad_norm': 5.965872287750244, 'learning_rate': 7.925107718339058e-06} {'loss': 0.8599, 'grad_norm': 5.965872287750244, 'learning_rate': 7.925107718339058e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.6043, 'grad_norm': 11.417373657226562, 'learning_rate': 7.918437432616431e-06}[Rank 2] Trainer log: {'loss': 0.6043, 'grad_norm': 11.417373657226562, 'learning_rate': 7.918437432616431e-06} [Rank 3] Trainer log: {'loss': 0.6043, 'grad_norm': 11.417373657226562, 'learning_rate': 7.918437432616431e-06} [Rank 0] Trainer log: {'loss': 0.6043, 'grad_norm': 11.417373657226562, 'learning_rate': 7.918437432616431e-06} {'loss': 0.6043, 'grad_norm': 11.417373657226562, 'learning_rate': 7.918437432616431e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8246, 'grad_norm': 7.222879886627197, 'learning_rate': 7.911768114843468e-06} [Rank 0] Trainer log: {'loss': 0.8246, 'grad_norm': 7.222879886627197, 'learning_rate': 7.911768114843468e-06}[Rank 1] Trainer log: {'loss': 0.8246, 'grad_norm': 7.222879886627197, 'learning_rate': 7.911768114843468e-06}[Rank 2] Trainer log: {'loss': 0.8246, 'grad_norm': 7.222879886627197, 'learning_rate': 7.911768114843468e-06} {'loss': 0.8246, 'grad_norm': 7.222879886627197, 'learning_rate': 7.911768114843468e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.8302, 'grad_norm': 7.66157865524292, 'learning_rate': 7.905099768121473e-06}[Rank 1] Trainer log: {'loss': 0.8302, 'grad_norm': 7.66157865524292, 'learning_rate': 7.905099768121473e-06} [Rank 3] Trainer log: {'loss': 0.8302, 'grad_norm': 7.66157865524292, 'learning_rate': 7.905099768121473e-06} [Rank 0] Trainer log: {'loss': 0.8302, 'grad_norm': 7.66157865524292, 'learning_rate': 7.905099768121473e-06} {'loss': 0.8302, 'grad_norm': 7.66157865524292, 'learning_rate': 7.905099768121473e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.9553, 'grad_norm': 3.785317897796631, 'learning_rate': 7.898432395551306e-06}[Rank 2] Trainer log: {'loss': 0.9553, 'grad_norm': 3.785317897796631, 'learning_rate': 7.898432395551306e-06} [Rank 3] Trainer log: {'loss': 0.9553, 'grad_norm': 3.785317897796631, 'learning_rate': 7.898432395551306e-06} [Rank 0] Trainer log: {'loss': 0.9553, 'grad_norm': 3.785317897796631, 'learning_rate': 7.898432395551306e-06} {'loss': 0.9553, 'grad_norm': 3.785317897796631, 'learning_rate': 7.898432395551306e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.7008, 'grad_norm': 3.4257867336273193, 'learning_rate': 7.891766000233368e-06}[Rank 3] Trainer log: {'loss': 0.7008, 'grad_norm': 3.4257867336273193, 'learning_rate': 7.891766000233368e-06}[Rank 1] Trainer log: {'loss': 0.7008, 'grad_norm': 3.4257867336273193, 'learning_rate': 7.891766000233368e-06} [Rank 0] Trainer log: {'loss': 0.7008, 'grad_norm': 3.4257867336273193, 'learning_rate': 7.891766000233368e-06} {'loss': 0.7008, 'grad_norm': 3.4257867336273193, 'learning_rate': 7.891766000233368e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 1.0111, 'grad_norm': 11.607796669006348, 'learning_rate': 7.885100585267601e-06}[Rank 0] Trainer log: {'loss': 1.0111, 'grad_norm': 11.607796669006348, 'learning_rate': 7.885100585267601e-06} [Rank 2] Trainer log: {'loss': 1.0111, 'grad_norm': 11.607796669006348, 'learning_rate': 7.885100585267601e-06} [Rank 1] Trainer log: {'loss': 1.0111, 'grad_norm': 11.607796669006348, 'learning_rate': 7.885100585267601e-06} {'loss': 1.0111, 'grad_norm': 11.607796669006348, 'learning_rate': 7.885100585267601e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.933, 'grad_norm': 6.577983856201172, 'learning_rate': 7.878436153753508e-06}[Rank 0] Trainer log: {'loss': 0.933, 'grad_norm': 6.577983856201172, 'learning_rate': 7.878436153753508e-06}[Rank 1] Trainer log: {'loss': 0.933, 'grad_norm': 6.577983856201172, 'learning_rate': 7.878436153753508e-06} [Rank 2] Trainer log: {'loss': 0.933, 'grad_norm': 6.577983856201172, 'learning_rate': 7.878436153753508e-06} {'loss': 0.933, 'grad_norm': 6.577983856201172, 'learning_rate': 7.878436153753508e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.706, 'grad_norm': 9.482253074645996, 'learning_rate': 7.871772708790114e-06}[Rank 0] Trainer log: {'loss': 0.706, 'grad_norm': 9.482253074645996, 'learning_rate': 7.871772708790114e-06}[Rank 1] Trainer log: {'loss': 0.706, 'grad_norm': 9.482253074645996, 'learning_rate': 7.871772708790114e-06} [Rank 3] Trainer log: {'loss': 0.706, 'grad_norm': 9.482253074645996, 'learning_rate': 7.871772708790114e-06} {'loss': 0.706, 'grad_norm': 9.482253074645996, 'learning_rate': 7.871772708790114e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8255, 'grad_norm': 2.3252604007720947, 'learning_rate': 7.865110253475996e-06} [Rank 1] Trainer log: {'loss': 0.8255, 'grad_norm': 2.3252604007720947, 'learning_rate': 7.865110253475996e-06} [Rank 0] Trainer log: {'loss': 0.8255, 'grad_norm': 2.3252604007720947, 'learning_rate': 7.865110253475996e-06}[Rank 2] Trainer log: {'loss': 0.8255, 'grad_norm': 2.3252604007720947, 'learning_rate': 7.865110253475996e-06} {'loss': 0.8255, 'grad_norm': 2.3252604007720947, 'learning_rate': 7.865110253475996e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 1.0067, 'grad_norm': 8.127254486083984, 'learning_rate': 7.858448790909275e-06}[Rank 0] Trainer log: {'loss': 1.0067, 'grad_norm': 8.127254486083984, 'learning_rate': 7.858448790909275e-06} [Rank 1] Trainer log: {'loss': 1.0067, 'grad_norm': 8.127254486083984, 'learning_rate': 7.858448790909275e-06}[Rank 2] Trainer log: {'loss': 1.0067, 'grad_norm': 8.127254486083984, 'learning_rate': 7.858448790909275e-06} {'loss': 1.0067, 'grad_norm': 8.127254486083984, 'learning_rate': 7.858448790909275e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.7498, 'grad_norm': 3.6645822525024414, 'learning_rate': 7.851788324187602e-06}[Rank 3] Trainer log: {'loss': 0.7498, 'grad_norm': 3.6645822525024414, 'learning_rate': 7.851788324187602e-06}[Rank 2] Trainer log: {'loss': 0.7498, 'grad_norm': 3.6645822525024414, 'learning_rate': 7.851788324187602e-06} [Rank 0] Trainer log: {'loss': 0.7498, 'grad_norm': 3.6645822525024414, 'learning_rate': 7.851788324187602e-06} {'loss': 0.7498, 'grad_norm': 3.6645822525024414, 'learning_rate': 7.851788324187602e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 0.8672, 'grad_norm': 3.1093926429748535, 'learning_rate': 7.845128856408162e-06}[Rank 1] Trainer log: {'loss': 0.8672, 'grad_norm': 3.1093926429748535, 'learning_rate': 7.845128856408162e-06}[Rank 2] Trainer log: {'loss': 0.8672, 'grad_norm': 3.1093926429748535, 'learning_rate': 7.845128856408162e-06} [Rank 3] Trainer log: {'loss': 0.8672, 'grad_norm': 3.1093926429748535, 'learning_rate': 7.845128856408162e-06} {'loss': 0.8672, 'grad_norm': 3.1093926429748535, 'learning_rate': 7.845128856408162e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8017, 'grad_norm': 10.54983139038086, 'learning_rate': 7.83847039066769e-06}[Rank 1] Trainer log: {'loss': 0.8017, 'grad_norm': 10.54983139038086, 'learning_rate': 7.83847039066769e-06}[Rank 2] Trainer log: {'loss': 0.8017, 'grad_norm': 10.54983139038086, 'learning_rate': 7.83847039066769e-06} [Rank 0] Trainer log: {'loss': 0.8017, 'grad_norm': 10.54983139038086, 'learning_rate': 7.83847039066769e-06} {'loss': 0.8017, 'grad_norm': 10.54983139038086, 'learning_rate': 7.83847039066769e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.8557, 'grad_norm': 4.682456970214844, 'learning_rate': 7.831812930062444e-06} [Rank 2] Trainer log: {'loss': 0.8557, 'grad_norm': 4.682456970214844, 'learning_rate': 7.831812930062444e-06}[Rank 3] Trainer log: {'loss': 0.8557, 'grad_norm': 4.682456970214844, 'learning_rate': 7.831812930062444e-06}[Rank 0] Trainer log: {'loss': 0.8557, 'grad_norm': 4.682456970214844, 'learning_rate': 7.831812930062444e-06} {'loss': 0.8557, 'grad_norm': 4.682456970214844, 'learning_rate': 7.831812930062444e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.7568, 'grad_norm': 4.295328617095947, 'learning_rate': 7.825156477688212e-06}[Rank 0] Trainer log: {'loss': 0.7568, 'grad_norm': 4.295328617095947, 'learning_rate': 7.825156477688212e-06} [Rank 1] Trainer log: {'loss': 0.7568, 'grad_norm': 4.295328617095947, 'learning_rate': 7.825156477688212e-06} [Rank 3] Trainer log: {'loss': 0.7568, 'grad_norm': 4.295328617095947, 'learning_rate': 7.825156477688212e-06} {'loss': 0.7568, 'grad_norm': 4.295328617095947, 'learning_rate': 7.825156477688212e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 0.8771, 'grad_norm': 4.573944568634033, 'learning_rate': 7.818501036640325e-06}[Rank 3] Trainer log: {'loss': 0.8771, 'grad_norm': 4.573944568634033, 'learning_rate': 7.818501036640325e-06}[Rank 2] Trainer log: {'loss': 0.8771, 'grad_norm': 4.573944568634033, 'learning_rate': 7.818501036640325e-06} [Rank 1] Trainer log: {'loss': 0.8771, 'grad_norm': 4.573944568634033, 'learning_rate': 7.818501036640325e-06} {'loss': 0.8771, 'grad_norm': 4.573944568634033, 'learning_rate': 7.818501036640325e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.9974, 'grad_norm': 8.324922561645508, 'learning_rate': 7.811846610013634e-06} [Rank 2] Trainer log: {'loss': 0.9974, 'grad_norm': 8.324922561645508, 'learning_rate': 7.811846610013634e-06}[Rank 1] Trainer log: {'loss': 0.9974, 'grad_norm': 8.324922561645508, 'learning_rate': 7.811846610013634e-06} [Rank 0] Trainer log: {'loss': 0.9974, 'grad_norm': 8.324922561645508, 'learning_rate': 7.811846610013634e-06} {'loss': 0.9974, 'grad_norm': 8.324922561645508, 'learning_rate': 7.811846610013634e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8023, 'grad_norm': 7.568994998931885, 'learning_rate': 7.805193200902517e-06}[Rank 2] Trainer log: {'loss': 0.8023, 'grad_norm': 7.568994998931885, 'learning_rate': 7.805193200902517e-06} [Rank 0] Trainer log: {'loss': 0.8023, 'grad_norm': 7.568994998931885, 'learning_rate': 7.805193200902517e-06}[Rank 1] Trainer log: {'loss': 0.8023, 'grad_norm': 7.568994998931885, 'learning_rate': 7.805193200902517e-06} {'loss': 0.8023, 'grad_norm': 7.568994998931885, 'learning_rate': 7.805193200902517e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.9437, 'grad_norm': 2.870285749435425, 'learning_rate': 7.798540812400892e-06}[Rank 2] Trainer log: {'loss': 0.9437, 'grad_norm': 2.870285749435425, 'learning_rate': 7.798540812400892e-06}[Rank 1] Trainer log: {'loss': 0.9437, 'grad_norm': 2.870285749435425, 'learning_rate': 7.798540812400892e-06} [Rank 0] Trainer log: {'loss': 0.9437, 'grad_norm': 2.870285749435425, 'learning_rate': 7.798540812400892e-06} {'loss': 0.9437, 'grad_norm': 2.870285749435425, 'learning_rate': 7.798540812400892e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.9212, 'grad_norm': 4.655298709869385, 'learning_rate': 7.791889447602188e-06}[Rank 2] Trainer log: {'loss': 0.9212, 'grad_norm': 4.655298709869385, 'learning_rate': 7.791889447602188e-06}[Rank 0] Trainer log: {'loss': 0.9212, 'grad_norm': 4.655298709869385, 'learning_rate': 7.791889447602188e-06} [Rank 1] Trainer log: {'loss': 0.9212, 'grad_norm': 4.655298709869385, 'learning_rate': 7.791889447602188e-06} {'loss': 0.9212, 'grad_norm': 4.655298709869385, 'learning_rate': 7.791889447602188e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.7062, 'grad_norm': 6.147986888885498, 'learning_rate': 7.785239109599361e-06}[Rank 2] Trainer log: {'loss': 0.7062, 'grad_norm': 6.147986888885498, 'learning_rate': 7.785239109599361e-06}[Rank 1] Trainer log: {'loss': 0.7062, 'grad_norm': 6.147986888885498, 'learning_rate': 7.785239109599361e-06} [Rank 0] Trainer log: {'loss': 0.7062, 'grad_norm': 6.147986888885498, 'learning_rate': 7.785239109599361e-06} {'loss': 0.7062, 'grad_norm': 6.147986888885498, 'learning_rate': 7.785239109599361e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.9761, 'grad_norm': 4.22588586807251, 'learning_rate': 7.778589801484893e-06}[Rank 3] Trainer log: {'loss': 0.9761, 'grad_norm': 4.22588586807251, 'learning_rate': 7.778589801484893e-06} [Rank 1] Trainer log: {'loss': 0.9761, 'grad_norm': 4.22588586807251, 'learning_rate': 7.778589801484893e-06} [Rank 0] Trainer log: {'loss': 0.9761, 'grad_norm': 4.22588586807251, 'learning_rate': 7.778589801484893e-06} {'loss': 0.9761, 'grad_norm': 4.22588586807251, 'learning_rate': 7.778589801484893e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.8981, 'grad_norm': 9.767950057983398, 'learning_rate': 7.771941526350793e-06} [Rank 1] Trainer log: {'loss': 0.8981, 'grad_norm': 9.767950057983398, 'learning_rate': 7.771941526350793e-06} [Rank 0] Trainer log: {'loss': 0.8981, 'grad_norm': 9.767950057983398, 'learning_rate': 7.771941526350793e-06}[Rank 2] Trainer log: {'loss': 0.8981, 'grad_norm': 9.767950057983398, 'learning_rate': 7.771941526350793e-06} {'loss': 0.8981, 'grad_norm': 9.767950057983398, 'learning_rate': 7.771941526350793e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 0.9711, 'grad_norm': 6.519673824310303, 'learning_rate': 7.765294287288573e-06}[Rank 1] Trainer log: {'loss': 0.9711, 'grad_norm': 6.519673824310303, 'learning_rate': 7.765294287288573e-06} [Rank 2] Trainer log: {'loss': 0.9711, 'grad_norm': 6.519673824310303, 'learning_rate': 7.765294287288573e-06} [Rank 3] Trainer log: {'loss': 0.9711, 'grad_norm': 6.519673824310303, 'learning_rate': 7.765294287288573e-06} {'loss': 0.9711, 'grad_norm': 6.519673824310303, 'learning_rate': 7.765294287288573e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.55, 'grad_norm': 8.52702808380127, 'learning_rate': 7.758648087389277e-06}[Rank 0] Trainer log: {'loss': 0.55, 'grad_norm': 8.52702808380127, 'learning_rate': 7.758648087389277e-06} [Rank 2] Trainer log: {'loss': 0.55, 'grad_norm': 8.52702808380127, 'learning_rate': 7.758648087389277e-06} [Rank 1] Trainer log: {'loss': 0.55, 'grad_norm': 8.52702808380127, 'learning_rate': 7.758648087389277e-06} {'loss': 0.55, 'grad_norm': 8.52702808380127, 'learning_rate': 7.758648087389277e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 1.0937, 'grad_norm': 5.842692852020264, 'learning_rate': 7.752002929743463e-06}[Rank 2] Trainer log: {'loss': 1.0937, 'grad_norm': 5.842692852020264, 'learning_rate': 7.752002929743463e-06}[Rank 1] Trainer log: {'loss': 1.0937, 'grad_norm': 5.842692852020264, 'learning_rate': 7.752002929743463e-06} [Rank 3] Trainer log: {'loss': 1.0937, 'grad_norm': 5.842692852020264, 'learning_rate': 7.752002929743463e-06} {'loss': 1.0937, 'grad_norm': 5.842692852020264, 'learning_rate': 7.752002929743463e-06, 'epoch': 0.59} [Rank 0] Trainer log: {'loss': 0.9637, 'grad_norm': 3.189523458480835, 'learning_rate': 7.745358817441203e-06}[Rank 3] Trainer log: {'loss': 0.9637, 'grad_norm': 3.189523458480835, 'learning_rate': 7.745358817441203e-06} [Rank 1] Trainer log: {'loss': 0.9637, 'grad_norm': 3.189523458480835, 'learning_rate': 7.745358817441203e-06} [Rank 2] Trainer log: {'loss': 0.9637, 'grad_norm': 3.189523458480835, 'learning_rate': 7.745358817441203e-06} {'loss': 0.9637, 'grad_norm': 3.189523458480835, 'learning_rate': 7.745358817441203e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.7886, 'grad_norm': 5.609487056732178, 'learning_rate': 7.738715753572078e-06}[Rank 3] Trainer log: {'loss': 0.7886, 'grad_norm': 5.609487056732178, 'learning_rate': 7.738715753572078e-06} [Rank 0] Trainer log: {'loss': 0.7886, 'grad_norm': 5.609487056732178, 'learning_rate': 7.738715753572078e-06}[Rank 2] Trainer log: {'loss': 0.7886, 'grad_norm': 5.609487056732178, 'learning_rate': 7.738715753572078e-06} {'loss': 0.7886, 'grad_norm': 5.609487056732178, 'learning_rate': 7.738715753572078e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.8229, 'grad_norm': 3.829141139984131, 'learning_rate': 7.732073741225191e-06}[Rank 1] Trainer log: {'loss': 0.8229, 'grad_norm': 3.829141139984131, 'learning_rate': 7.732073741225191e-06}[Rank 3] Trainer log: {'loss': 0.8229, 'grad_norm': 3.829141139984131, 'learning_rate': 7.732073741225191e-06} [Rank 0] Trainer log: {'loss': 0.8229, 'grad_norm': 3.829141139984131, 'learning_rate': 7.732073741225191e-06} {'loss': 0.8229, 'grad_norm': 3.829141139984131, 'learning_rate': 7.732073741225191e-06, 'epoch': 0.59} [Rank 3] Trainer log: {'loss': 0.7462, 'grad_norm': 4.6888275146484375, 'learning_rate': 7.725432783489152e-06} [Rank 0] Trainer log: {'loss': 0.7462, 'grad_norm': 4.6888275146484375, 'learning_rate': 7.725432783489152e-06}[Rank 2] Trainer log: {'loss': 0.7462, 'grad_norm': 4.6888275146484375, 'learning_rate': 7.725432783489152e-06}[Rank 1] Trainer log: {'loss': 0.7462, 'grad_norm': 4.6888275146484375, 'learning_rate': 7.725432783489152e-06} {'loss': 0.7462, 'grad_norm': 4.6888275146484375, 'learning_rate': 7.725432783489152e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.883, 'grad_norm': 6.723518371582031, 'learning_rate': 7.718792883452074e-06}[Rank 3] Trainer log: {'loss': 0.883, 'grad_norm': 6.723518371582031, 'learning_rate': 7.718792883452074e-06}[Rank 1] Trainer log: {'loss': 0.883, 'grad_norm': 6.723518371582031, 'learning_rate': 7.718792883452074e-06} [Rank 0] Trainer log: {'loss': 0.883, 'grad_norm': 6.723518371582031, 'learning_rate': 7.718792883452074e-06} {'loss': 0.883, 'grad_norm': 6.723518371582031, 'learning_rate': 7.718792883452074e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.8181, 'grad_norm': 1.5881062746047974, 'learning_rate': 7.712154044201592e-06}[Rank 2] Trainer log: {'loss': 0.8181, 'grad_norm': 1.5881062746047974, 'learning_rate': 7.712154044201592e-06} [Rank 3] Trainer log: {'loss': 0.8181, 'grad_norm': 1.5881062746047974, 'learning_rate': 7.712154044201592e-06} [Rank 0] Trainer log: {'loss': 0.8181, 'grad_norm': 1.5881062746047974, 'learning_rate': 7.712154044201592e-06} {'loss': 0.8181, 'grad_norm': 1.5881062746047974, 'learning_rate': 7.712154044201592e-06, 'epoch': 0.59} [Rank 2] Trainer log: {'loss': 0.7008, 'grad_norm': 9.28088092803955, 'learning_rate': 7.705516268824835e-06}[Rank 3] Trainer log: {'loss': 0.7008, 'grad_norm': 9.28088092803955, 'learning_rate': 7.705516268824835e-06} [Rank 1] Trainer log: {'loss': 0.7008, 'grad_norm': 9.28088092803955, 'learning_rate': 7.705516268824835e-06} [Rank 0] Trainer log: {'loss': 0.7008, 'grad_norm': 9.28088092803955, 'learning_rate': 7.705516268824835e-06} {'loss': 0.7008, 'grad_norm': 9.28088092803955, 'learning_rate': 7.705516268824835e-06, 'epoch': 0.59} [Rank 1] Trainer log: {'loss': 0.591, 'grad_norm': 5.0907416343688965, 'learning_rate': 7.698879560408442e-06}[Rank 0] Trainer log: {'loss': 0.591, 'grad_norm': 5.0907416343688965, 'learning_rate': 7.698879560408442e-06}[Rank 2] Trainer log: {'loss': 0.591, 'grad_norm': 5.0907416343688965, 'learning_rate': 7.698879560408442e-06} [Rank 3] Trainer log: {'loss': 0.591, 'grad_norm': 5.0907416343688965, 'learning_rate': 7.698879560408442e-06} {'loss': 0.591, 'grad_norm': 5.0907416343688965, 'learning_rate': 7.698879560408442e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.9497, 'grad_norm': 8.395511627197266, 'learning_rate': 7.692243922038558e-06} [Rank 1] Trainer log: {'loss': 0.9497, 'grad_norm': 8.395511627197266, 'learning_rate': 7.692243922038558e-06}[Rank 0] Trainer log: {'loss': 0.9497, 'grad_norm': 8.395511627197266, 'learning_rate': 7.692243922038558e-06} [Rank 2] Trainer log: {'loss': 0.9497, 'grad_norm': 8.395511627197266, 'learning_rate': 7.692243922038558e-06} {'loss': 0.9497, 'grad_norm': 8.395511627197266, 'learning_rate': 7.692243922038558e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.8161, 'grad_norm': 5.2689619064331055, 'learning_rate': 7.68560935680083e-06}[Rank 3] Trainer log: {'loss': 0.8161, 'grad_norm': 5.2689619064331055, 'learning_rate': 7.68560935680083e-06} [Rank 0] Trainer log: {'loss': 0.8161, 'grad_norm': 5.2689619064331055, 'learning_rate': 7.68560935680083e-06}[Rank 1] Trainer log: {'loss': 0.8161, 'grad_norm': 5.2689619064331055, 'learning_rate': 7.68560935680083e-06} {'loss': 0.8161, 'grad_norm': 5.2689619064331055, 'learning_rate': 7.68560935680083e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.8257, 'grad_norm': 6.594715595245361, 'learning_rate': 7.678975867780396e-06}[Rank 0] Trainer log: {'loss': 0.8257, 'grad_norm': 6.594715595245361, 'learning_rate': 7.678975867780396e-06}[Rank 1] Trainer log: {'loss': 0.8257, 'grad_norm': 6.594715595245361, 'learning_rate': 7.678975867780396e-06} [Rank 3] Trainer log: {'loss': 0.8257, 'grad_norm': 6.594715595245361, 'learning_rate': 7.678975867780396e-06} {'loss': 0.8257, 'grad_norm': 6.594715595245361, 'learning_rate': 7.678975867780396e-06, 'epoch': 0.6} [Rank 0] Trainer log: {'loss': 0.6979, 'grad_norm': 7.128732204437256, 'learning_rate': 7.67234345806191e-06}[Rank 2] Trainer log: {'loss': 0.6979, 'grad_norm': 7.128732204437256, 'learning_rate': 7.67234345806191e-06}[Rank 3] Trainer log: {'loss': 0.6979, 'grad_norm': 7.128732204437256, 'learning_rate': 7.67234345806191e-06} [Rank 1] Trainer log: {'loss': 0.6979, 'grad_norm': 7.128732204437256, 'learning_rate': 7.67234345806191e-06} {'loss': 0.6979, 'grad_norm': 7.128732204437256, 'learning_rate': 7.67234345806191e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.6405, 'grad_norm': 3.807426929473877, 'learning_rate': 7.665712130729516e-06}[Rank 2] Trainer log: {'loss': 0.6405, 'grad_norm': 3.807426929473877, 'learning_rate': 7.665712130729516e-06}[Rank 0] Trainer log: {'loss': 0.6405, 'grad_norm': 3.807426929473877, 'learning_rate': 7.665712130729516e-06} [Rank 3] Trainer log: {'loss': 0.6405, 'grad_norm': 3.807426929473877, 'learning_rate': 7.665712130729516e-06} {'loss': 0.6405, 'grad_norm': 3.807426929473877, 'learning_rate': 7.665712130729516e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.8507, 'grad_norm': 3.1542508602142334, 'learning_rate': 7.659081888866848e-06} [Rank 0] Trainer log: {'loss': 0.8507, 'grad_norm': 3.1542508602142334, 'learning_rate': 7.659081888866848e-06}[Rank 1] Trainer log: {'loss': 0.8507, 'grad_norm': 3.1542508602142334, 'learning_rate': 7.659081888866848e-06} [Rank 2] Trainer log: {'loss': 0.8507, 'grad_norm': 3.1542508602142334, 'learning_rate': 7.659081888866848e-06} {'loss': 0.8507, 'grad_norm': 3.1542508602142334, 'learning_rate': 7.659081888866848e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.7124, 'grad_norm': 5.037204742431641, 'learning_rate': 7.652452735557052e-06}[Rank 1] Trainer log: {'loss': 0.7124, 'grad_norm': 5.037204742431641, 'learning_rate': 7.652452735557052e-06} [Rank 2] Trainer log: {'loss': 0.7124, 'grad_norm': 5.037204742431641, 'learning_rate': 7.652452735557052e-06}[Rank 0] Trainer log: {'loss': 0.7124, 'grad_norm': 5.037204742431641, 'learning_rate': 7.652452735557052e-06} {'loss': 0.7124, 'grad_norm': 5.037204742431641, 'learning_rate': 7.652452735557052e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.9356, 'grad_norm': 7.624131679534912, 'learning_rate': 7.64582467388275e-06} [Rank 2] Trainer log: {'loss': 0.9356, 'grad_norm': 7.624131679534912, 'learning_rate': 7.64582467388275e-06} [Rank 1] Trainer log: {'loss': 0.9356, 'grad_norm': 7.624131679534912, 'learning_rate': 7.64582467388275e-06}[Rank 0] Trainer log: {'loss': 0.9356, 'grad_norm': 7.624131679534912, 'learning_rate': 7.64582467388275e-06} {'loss': 0.9356, 'grad_norm': 7.624131679534912, 'learning_rate': 7.64582467388275e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.8407, 'grad_norm': 4.133847713470459, 'learning_rate': 7.639197706926062e-06}[Rank 0] Trainer log: {'loss': 0.8407, 'grad_norm': 4.133847713470459, 'learning_rate': 7.639197706926062e-06} [Rank 2] Trainer log: {'loss': 0.8407, 'grad_norm': 4.133847713470459, 'learning_rate': 7.639197706926062e-06} [Rank 1] Trainer log: {'loss': 0.8407, 'grad_norm': 4.133847713470459, 'learning_rate': 7.639197706926062e-06} {'loss': 0.8407, 'grad_norm': 4.133847713470459, 'learning_rate': 7.639197706926062e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.6898, 'grad_norm': 2.8003904819488525, 'learning_rate': 7.632571837768612e-06}[Rank 2] Trainer log: {'loss': 0.6898, 'grad_norm': 2.8003904819488525, 'learning_rate': 7.632571837768612e-06} [Rank 3] Trainer log: {'loss': 0.6898, 'grad_norm': 2.8003904819488525, 'learning_rate': 7.632571837768612e-06} [Rank 0] Trainer log: {'loss': 0.6898, 'grad_norm': 2.8003904819488525, 'learning_rate': 7.632571837768612e-06} {'loss': 0.6898, 'grad_norm': 2.8003904819488525, 'learning_rate': 7.632571837768612e-06, 'epoch': 0.6} [Rank 0] Trainer log: {'loss': 0.7022, 'grad_norm': 2.067680835723877, 'learning_rate': 7.625947069491495e-06}[Rank 1] Trainer log: {'loss': 0.7022, 'grad_norm': 2.067680835723877, 'learning_rate': 7.625947069491495e-06}[Rank 3] Trainer log: {'loss': 0.7022, 'grad_norm': 2.067680835723877, 'learning_rate': 7.625947069491495e-06} [Rank 2] Trainer log: {'loss': 0.7022, 'grad_norm': 2.067680835723877, 'learning_rate': 7.625947069491495e-06} {'loss': 0.7022, 'grad_norm': 2.067680835723877, 'learning_rate': 7.625947069491495e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.5159, 'grad_norm': 1.5434848070144653, 'learning_rate': 7.619323405175299e-06}[Rank 1] Trainer log: {'loss': 0.5159, 'grad_norm': 1.5434848070144653, 'learning_rate': 7.619323405175299e-06}[Rank 2] Trainer log: {'loss': 0.5159, 'grad_norm': 1.5434848070144653, 'learning_rate': 7.619323405175299e-06} [Rank 0] Trainer log: {'loss': 0.5159, 'grad_norm': 1.5434848070144653, 'learning_rate': 7.619323405175299e-06} {'loss': 0.5159, 'grad_norm': 1.5434848070144653, 'learning_rate': 7.619323405175299e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.7351, 'grad_norm': 2.5101144313812256, 'learning_rate': 7.6127008479001065e-06}[Rank 3] Trainer log: {'loss': 0.7351, 'grad_norm': 2.5101144313812256, 'learning_rate': 7.6127008479001065e-06}[Rank 1] Trainer log: {'loss': 0.7351, 'grad_norm': 2.5101144313812256, 'learning_rate': 7.6127008479001065e-06} [Rank 0] Trainer log: {'loss': 0.7351, 'grad_norm': 2.5101144313812256, 'learning_rate': 7.6127008479001065e-06} {'loss': 0.7351, 'grad_norm': 2.5101144313812256, 'learning_rate': 7.6127008479001065e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.8064, 'grad_norm': 6.001944065093994, 'learning_rate': 7.606079400745481e-06} [Rank 2] Trainer log: {'loss': 0.8064, 'grad_norm': 6.001944065093994, 'learning_rate': 7.606079400745481e-06}[Rank 0] Trainer log: {'loss': 0.8064, 'grad_norm': 6.001944065093994, 'learning_rate': 7.606079400745481e-06}[Rank 3] Trainer log: {'loss': 0.8064, 'grad_norm': 6.001944065093994, 'learning_rate': 7.606079400745481e-06} {'loss': 0.8064, 'grad_norm': 6.001944065093994, 'learning_rate': 7.606079400745481e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.9561, 'grad_norm': 2.329766035079956, 'learning_rate': 7.599459066790466e-06}[Rank 0] Trainer log: {'loss': 0.9561, 'grad_norm': 2.329766035079956, 'learning_rate': 7.599459066790466e-06} [Rank 1] Trainer log: {'loss': 0.9561, 'grad_norm': 2.329766035079956, 'learning_rate': 7.599459066790466e-06}[Rank 2] Trainer log: {'loss': 0.9561, 'grad_norm': 2.329766035079956, 'learning_rate': 7.599459066790466e-06} {'loss': 0.9561, 'grad_norm': 2.329766035079956, 'learning_rate': 7.599459066790466e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.9417, 'grad_norm': 7.001340389251709, 'learning_rate': 7.592839849113586e-06}[Rank 3] Trainer log: {'loss': 0.9417, 'grad_norm': 7.001340389251709, 'learning_rate': 7.592839849113586e-06}[Rank 1] Trainer log: {'loss': 0.9417, 'grad_norm': 7.001340389251709, 'learning_rate': 7.592839849113586e-06} [Rank 0] Trainer log: {'loss': 0.9417, 'grad_norm': 7.001340389251709, 'learning_rate': 7.592839849113586e-06} {'loss': 0.9417, 'grad_norm': 7.001340389251709, 'learning_rate': 7.592839849113586e-06, 'epoch': 0.6} [Rank 0] Trainer log: {'loss': 0.4485, 'grad_norm': 6.775599479675293, 'learning_rate': 7.586221750792857e-06}[Rank 3] Trainer log: {'loss': 0.4485, 'grad_norm': 6.775599479675293, 'learning_rate': 7.586221750792857e-06} [Rank 1] Trainer log: {'loss': 0.4485, 'grad_norm': 6.775599479675293, 'learning_rate': 7.586221750792857e-06}[Rank 2] Trainer log: {'loss': 0.4485, 'grad_norm': 6.775599479675293, 'learning_rate': 7.586221750792857e-06} {'loss': 0.4485, 'grad_norm': 6.775599479675293, 'learning_rate': 7.586221750792857e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.6823, 'grad_norm': 3.220679521560669, 'learning_rate': 7.579604774905769e-06}[Rank 1] Trainer log: {'loss': 0.6823, 'grad_norm': 3.220679521560669, 'learning_rate': 7.579604774905769e-06}[Rank 2] Trainer log: {'loss': 0.6823, 'grad_norm': 3.220679521560669, 'learning_rate': 7.579604774905769e-06} [Rank 0] Trainer log: {'loss': 0.6823, 'grad_norm': 3.220679521560669, 'learning_rate': 7.579604774905769e-06} {'loss': 0.6823, 'grad_norm': 3.220679521560669, 'learning_rate': 7.579604774905769e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.5877, 'grad_norm': 8.885622024536133, 'learning_rate': 7.572988924529281e-06}[Rank 1] Trainer log: {'loss': 0.5877, 'grad_norm': 8.885622024536133, 'learning_rate': 7.572988924529281e-06} [Rank 3] Trainer log: {'loss': 0.5877, 'grad_norm': 8.885622024536133, 'learning_rate': 7.572988924529281e-06} [Rank 0] Trainer log: {'loss': 0.5877, 'grad_norm': 8.885622024536133, 'learning_rate': 7.572988924529281e-06} {'loss': 0.5877, 'grad_norm': 8.885622024536133, 'learning_rate': 7.572988924529281e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.9355, 'grad_norm': 3.3294732570648193, 'learning_rate': 7.566374202739848e-06}[Rank 3] Trainer log: {'loss': 0.9355, 'grad_norm': 3.3294732570648193, 'learning_rate': 7.566374202739848e-06}[Rank 2] Trainer log: {'loss': 0.9355, 'grad_norm': 3.3294732570648193, 'learning_rate': 7.566374202739848e-06} [Rank 0] Trainer log: {'loss': 0.9355, 'grad_norm': 3.3294732570648193, 'learning_rate': 7.566374202739848e-06} {'loss': 0.9355, 'grad_norm': 3.3294732570648193, 'learning_rate': 7.566374202739848e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.702, 'grad_norm': 4.3331098556518555, 'learning_rate': 7.55976061261338e-06}[Rank 2] Trainer log: {'loss': 0.702, 'grad_norm': 4.3331098556518555, 'learning_rate': 7.55976061261338e-06} [Rank 1] Trainer log: {'loss': 0.702, 'grad_norm': 4.3331098556518555, 'learning_rate': 7.55976061261338e-06} [Rank 0] Trainer log: {'loss': 0.702, 'grad_norm': 4.3331098556518555, 'learning_rate': 7.55976061261338e-06} {'loss': 0.702, 'grad_norm': 4.3331098556518555, 'learning_rate': 7.55976061261338e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.8101, 'grad_norm': 2.8455121517181396, 'learning_rate': 7.55314815722527e-06}[Rank 2] Trainer log: {'loss': 0.8101, 'grad_norm': 2.8455121517181396, 'learning_rate': 7.55314815722527e-06}[Rank 1] Trainer log: {'loss': 0.8101, 'grad_norm': 2.8455121517181396, 'learning_rate': 7.55314815722527e-06} [Rank 0] Trainer log: {'loss': 0.8101, 'grad_norm': 2.8455121517181396, 'learning_rate': 7.55314815722527e-06} {'loss': 0.8101, 'grad_norm': 2.8455121517181396, 'learning_rate': 7.55314815722527e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 1.0285, 'grad_norm': 8.942374229431152, 'learning_rate': 7.54653683965039e-06} [Rank 0] Trainer log: {'loss': 1.0285, 'grad_norm': 8.942374229431152, 'learning_rate': 7.54653683965039e-06}[Rank 2] Trainer log: {'loss': 1.0285, 'grad_norm': 8.942374229431152, 'learning_rate': 7.54653683965039e-06} [Rank 1] Trainer log: {'loss': 1.0285, 'grad_norm': 8.942374229431152, 'learning_rate': 7.54653683965039e-06} {'loss': 1.0285, 'grad_norm': 8.942374229431152, 'learning_rate': 7.54653683965039e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.9662, 'grad_norm': 8.60128116607666, 'learning_rate': 7.539926662963072e-06}[Rank 3] Trainer log: {'loss': 0.9662, 'grad_norm': 8.60128116607666, 'learning_rate': 7.539926662963072e-06} [Rank 2] Trainer log: {'loss': 0.9662, 'grad_norm': 8.60128116607666, 'learning_rate': 7.539926662963072e-06} [Rank 0] Trainer log: {'loss': 0.9662, 'grad_norm': 8.60128116607666, 'learning_rate': 7.539926662963072e-06} {'loss': 0.9662, 'grad_norm': 8.60128116607666, 'learning_rate': 7.539926662963072e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.9438, 'grad_norm': 4.245582580566406, 'learning_rate': 7.533317630237117e-06} [Rank 0] Trainer log: {'loss': 0.9438, 'grad_norm': 4.245582580566406, 'learning_rate': 7.533317630237117e-06}[Rank 1] Trainer log: {'loss': 0.9438, 'grad_norm': 4.245582580566406, 'learning_rate': 7.533317630237117e-06}[Rank 3] Trainer log: {'loss': 0.9438, 'grad_norm': 4.245582580566406, 'learning_rate': 7.533317630237117e-06} {'loss': 0.9438, 'grad_norm': 4.245582580566406, 'learning_rate': 7.533317630237117e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.8287, 'grad_norm': 2.34808087348938, 'learning_rate': 7.5267097445458035e-06}[Rank 3] Trainer log: {'loss': 0.8287, 'grad_norm': 2.34808087348938, 'learning_rate': 7.5267097445458035e-06}[Rank 2] Trainer log: {'loss': 0.8287, 'grad_norm': 2.34808087348938, 'learning_rate': 7.5267097445458035e-06} [Rank 0] Trainer log: {'loss': 0.8287, 'grad_norm': 2.34808087348938, 'learning_rate': 7.5267097445458035e-06} {'loss': 0.8287, 'grad_norm': 2.34808087348938, 'learning_rate': 7.5267097445458035e-06, 'epoch': 0.6} [Rank 0] Trainer log: {'loss': 0.8876, 'grad_norm': 3.167921304702759, 'learning_rate': 7.520103008961875e-06}[Rank 3] Trainer log: {'loss': 0.8876, 'grad_norm': 3.167921304702759, 'learning_rate': 7.520103008961875e-06} [Rank 2] Trainer log: {'loss': 0.8876, 'grad_norm': 3.167921304702759, 'learning_rate': 7.520103008961875e-06} [Rank 1] Trainer log: {'loss': 0.8876, 'grad_norm': 3.167921304702759, 'learning_rate': 7.520103008961875e-06} {'loss': 0.8876, 'grad_norm': 3.167921304702759, 'learning_rate': 7.520103008961875e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 1.0675, 'grad_norm': 7.022045135498047, 'learning_rate': 7.513497426557526e-06} [Rank 1] Trainer log: {'loss': 1.0675, 'grad_norm': 7.022045135498047, 'learning_rate': 7.513497426557526e-06} [Rank 0] Trainer log: {'loss': 1.0675, 'grad_norm': 7.022045135498047, 'learning_rate': 7.513497426557526e-06}[Rank 2] Trainer log: {'loss': 1.0675, 'grad_norm': 7.022045135498047, 'learning_rate': 7.513497426557526e-06} {'loss': 1.0675, 'grad_norm': 7.022045135498047, 'learning_rate': 7.513497426557526e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.6467, 'grad_norm': 3.044562339782715, 'learning_rate': 7.506893000404437e-06}[Rank 2] Trainer log: {'loss': 0.6467, 'grad_norm': 3.044562339782715, 'learning_rate': 7.506893000404437e-06}[Rank 3] Trainer log: {'loss': 0.6467, 'grad_norm': 3.044562339782715, 'learning_rate': 7.506893000404437e-06} [Rank 0] Trainer log: {'loss': 0.6467, 'grad_norm': 3.044562339782715, 'learning_rate': 7.506893000404437e-06} {'loss': 0.6467, 'grad_norm': 3.044562339782715, 'learning_rate': 7.506893000404437e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.884, 'grad_norm': 3.137956142425537, 'learning_rate': 7.500289733573733e-06}[Rank 3] Trainer log: {'loss': 0.884, 'grad_norm': 3.137956142425537, 'learning_rate': 7.500289733573733e-06} [Rank 1] Trainer log: {'loss': 0.884, 'grad_norm': 3.137956142425537, 'learning_rate': 7.500289733573733e-06} [Rank 0] Trainer log: {'loss': 0.884, 'grad_norm': 3.137956142425537, 'learning_rate': 7.500289733573733e-06} {'loss': 0.884, 'grad_norm': 3.137956142425537, 'learning_rate': 7.500289733573733e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.7913, 'grad_norm': 4.529285430908203, 'learning_rate': 7.493687629136004e-06}[Rank 0] Trainer log: {'loss': 0.7913, 'grad_norm': 4.529285430908203, 'learning_rate': 7.493687629136004e-06} [Rank 1] Trainer log: {'loss': 0.7913, 'grad_norm': 4.529285430908203, 'learning_rate': 7.493687629136004e-06}[Rank 2] Trainer log: {'loss': 0.7913, 'grad_norm': 4.529285430908203, 'learning_rate': 7.493687629136004e-06} {'loss': 0.7913, 'grad_norm': 4.529285430908203, 'learning_rate': 7.493687629136004e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.925, 'grad_norm': 19.357303619384766, 'learning_rate': 7.4870866901613094e-06}[Rank 3] Trainer log: {'loss': 0.925, 'grad_norm': 19.357303619384766, 'learning_rate': 7.4870866901613094e-06} [Rank 0] Trainer log: {'loss': 0.925, 'grad_norm': 19.357303619384766, 'learning_rate': 7.4870866901613094e-06} [Rank 1] Trainer log: {'loss': 0.925, 'grad_norm': 19.357303619384766, 'learning_rate': 7.4870866901613094e-06} {'loss': 0.925, 'grad_norm': 19.357303619384766, 'learning_rate': 7.4870866901613094e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.8531, 'grad_norm': 6.889987468719482, 'learning_rate': 7.480486919719157e-06}[Rank 0] Trainer log: {'loss': 0.8531, 'grad_norm': 6.889987468719482, 'learning_rate': 7.480486919719157e-06}[Rank 2] Trainer log: {'loss': 0.8531, 'grad_norm': 6.889987468719482, 'learning_rate': 7.480486919719157e-06} [Rank 1] Trainer log: {'loss': 0.8531, 'grad_norm': 6.889987468719482, 'learning_rate': 7.480486919719157e-06} {'loss': 0.8531, 'grad_norm': 6.889987468719482, 'learning_rate': 7.480486919719157e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.8107, 'grad_norm': 5.4776811599731445, 'learning_rate': 7.473888320878504e-06}[Rank 1] Trainer log: {'loss': 0.8107, 'grad_norm': 5.4776811599731445, 'learning_rate': 7.473888320878504e-06}[Rank 3] Trainer log: {'loss': 0.8107, 'grad_norm': 5.4776811599731445, 'learning_rate': 7.473888320878504e-06} [Rank 0] Trainer log: {'loss': 0.8107, 'grad_norm': 5.4776811599731445, 'learning_rate': 7.473888320878504e-06} {'loss': 0.8107, 'grad_norm': 5.4776811599731445, 'learning_rate': 7.473888320878504e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.936, 'grad_norm': 4.378544330596924, 'learning_rate': 7.467290896707783e-06}[Rank 0] Trainer log: {'loss': 0.936, 'grad_norm': 4.378544330596924, 'learning_rate': 7.467290896707783e-06}[Rank 2] Trainer log: {'loss': 0.936, 'grad_norm': 4.378544330596924, 'learning_rate': 7.467290896707783e-06} [Rank 3] Trainer log: {'loss': 0.936, 'grad_norm': 4.378544330596924, 'learning_rate': 7.467290896707783e-06} {'loss': 0.936, 'grad_norm': 4.378544330596924, 'learning_rate': 7.467290896707783e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.62, 'grad_norm': 8.740341186523438, 'learning_rate': 7.460694650274867e-06}[Rank 2] Trainer log: {'loss': 0.62, 'grad_norm': 8.740341186523438, 'learning_rate': 7.460694650274867e-06}[Rank 3] Trainer log: {'loss': 0.62, 'grad_norm': 8.740341186523438, 'learning_rate': 7.460694650274867e-06} [Rank 0] Trainer log: {'loss': 0.62, 'grad_norm': 8.740341186523438, 'learning_rate': 7.460694650274867e-06} {'loss': 0.62, 'grad_norm': 8.740341186523438, 'learning_rate': 7.460694650274867e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.7984, 'grad_norm': 1.7059037685394287, 'learning_rate': 7.454099584647079e-06}[Rank 2] Trainer log: {'loss': 0.7984, 'grad_norm': 1.7059037685394287, 'learning_rate': 7.454099584647079e-06}[Rank 1] Trainer log: {'loss': 0.7984, 'grad_norm': 1.7059037685394287, 'learning_rate': 7.454099584647079e-06} [Rank 0] Trainer log: {'loss': 0.7984, 'grad_norm': 1.7059037685394287, 'learning_rate': 7.454099584647079e-06} {'loss': 0.7984, 'grad_norm': 1.7059037685394287, 'learning_rate': 7.454099584647079e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.6062, 'grad_norm': 11.985270500183105, 'learning_rate': 7.447505702891202e-06} [Rank 3] Trainer log: {'loss': 0.6062, 'grad_norm': 11.985270500183105, 'learning_rate': 7.447505702891202e-06} [Rank 1] Trainer log: {'loss': 0.6062, 'grad_norm': 11.985270500183105, 'learning_rate': 7.447505702891202e-06} [Rank 0] Trainer log: {'loss': 0.6062, 'grad_norm': 11.985270500183105, 'learning_rate': 7.447505702891202e-06} {'loss': 0.6062, 'grad_norm': 11.985270500183105, 'learning_rate': 7.447505702891202e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.7176, 'grad_norm': 3.492624282836914, 'learning_rate': 7.440913008073464e-06} [Rank 0] Trainer log: {'loss': 0.7176, 'grad_norm': 3.492624282836914, 'learning_rate': 7.440913008073464e-06}[Rank 3] Trainer log: {'loss': 0.7176, 'grad_norm': 3.492624282836914, 'learning_rate': 7.440913008073464e-06} [Rank 2] Trainer log: {'loss': 0.7176, 'grad_norm': 3.492624282836914, 'learning_rate': 7.440913008073464e-06} {'loss': 0.7176, 'grad_norm': 3.492624282836914, 'learning_rate': 7.440913008073464e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.7116, 'grad_norm': 7.147670269012451, 'learning_rate': 7.434321503259541e-06} [Rank 3] Trainer log: {'loss': 0.7116, 'grad_norm': 7.147670269012451, 'learning_rate': 7.434321503259541e-06} [Rank 1] Trainer log: {'loss': 0.7116, 'grad_norm': 7.147670269012451, 'learning_rate': 7.434321503259541e-06} [Rank 0] Trainer log: {'loss': 0.7116, 'grad_norm': 7.147670269012451, 'learning_rate': 7.434321503259541e-06} {'loss': 0.7116, 'grad_norm': 7.147670269012451, 'learning_rate': 7.434321503259541e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.9376, 'grad_norm': 3.124603509902954, 'learning_rate': 7.427731191514549e-06}[Rank 1] Trainer log: {'loss': 0.9376, 'grad_norm': 3.124603509902954, 'learning_rate': 7.427731191514549e-06}[Rank 3] Trainer log: {'loss': 0.9376, 'grad_norm': 3.124603509902954, 'learning_rate': 7.427731191514549e-06} [Rank 0] Trainer log: {'loss': 0.9376, 'grad_norm': 3.124603509902954, 'learning_rate': 7.427731191514549e-06} {'loss': 0.9376, 'grad_norm': 3.124603509902954, 'learning_rate': 7.427731191514549e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.899, 'grad_norm': 10.070174217224121, 'learning_rate': 7.421142075903067e-06}[Rank 2] Trainer log: {'loss': 0.899, 'grad_norm': 10.070174217224121, 'learning_rate': 7.421142075903067e-06}[Rank 0] Trainer log: {'loss': 0.899, 'grad_norm': 10.070174217224121, 'learning_rate': 7.421142075903067e-06} [Rank 3] Trainer log: {'loss': 0.899, 'grad_norm': 10.070174217224121, 'learning_rate': 7.421142075903067e-06} {'loss': 0.899, 'grad_norm': 10.070174217224121, 'learning_rate': 7.421142075903067e-06, 'epoch': 0.6} [Rank 3] Trainer log: {'loss': 0.7946, 'grad_norm': 6.182849884033203, 'learning_rate': 7.4145541594891e-06}[Rank 2] Trainer log: {'loss': 0.7946, 'grad_norm': 6.182849884033203, 'learning_rate': 7.4145541594891e-06}[Rank 1] Trainer log: {'loss': 0.7946, 'grad_norm': 6.182849884033203, 'learning_rate': 7.4145541594891e-06} [Rank 0] Trainer log: {'loss': 0.7946, 'grad_norm': 6.182849884033203, 'learning_rate': 7.4145541594891e-06} {'loss': 0.7946, 'grad_norm': 6.182849884033203, 'learning_rate': 7.4145541594891e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.7273, 'grad_norm': 3.825021982192993, 'learning_rate': 7.407967445336101e-06}[Rank 2] Trainer log: {'loss': 0.7273, 'grad_norm': 3.825021982192993, 'learning_rate': 7.407967445336101e-06} [Rank 0] Trainer log: {'loss': 0.7273, 'grad_norm': 3.825021982192993, 'learning_rate': 7.407967445336101e-06} [Rank 3] Trainer log: {'loss': 0.7273, 'grad_norm': 3.825021982192993, 'learning_rate': 7.407967445336101e-06} {'loss': 0.7273, 'grad_norm': 3.825021982192993, 'learning_rate': 7.407967445336101e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.5881, 'grad_norm': 3.2093820571899414, 'learning_rate': 7.401381936506973e-06} [Rank 0] Trainer log: {'loss': 0.5881, 'grad_norm': 3.2093820571899414, 'learning_rate': 7.401381936506973e-06}[Rank 3] Trainer log: {'loss': 0.5881, 'grad_norm': 3.2093820571899414, 'learning_rate': 7.401381936506973e-06} [Rank 1] Trainer log: {'loss': 0.5881, 'grad_norm': 3.2093820571899414, 'learning_rate': 7.401381936506973e-06} {'loss': 0.5881, 'grad_norm': 3.2093820571899414, 'learning_rate': 7.401381936506973e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.7861, 'grad_norm': 3.333134412765503, 'learning_rate': 7.3947976360640435e-06} [Rank 0] Trainer log: {'loss': 0.7861, 'grad_norm': 3.333134412765503, 'learning_rate': 7.3947976360640435e-06}[Rank 2] Trainer log: {'loss': 0.7861, 'grad_norm': 3.333134412765503, 'learning_rate': 7.3947976360640435e-06} [Rank 3] Trainer log: {'loss': 0.7861, 'grad_norm': 3.333134412765503, 'learning_rate': 7.3947976360640435e-06} {'loss': 0.7861, 'grad_norm': 3.333134412765503, 'learning_rate': 7.3947976360640435e-06, 'epoch': 0.6} [Rank 1] Trainer log: {'loss': 0.9444, 'grad_norm': 2.585488796234131, 'learning_rate': 7.388214547069086e-06}[Rank 3] Trainer log: {'loss': 0.9444, 'grad_norm': 2.585488796234131, 'learning_rate': 7.388214547069086e-06} [Rank 0] Trainer log: {'loss': 0.9444, 'grad_norm': 2.585488796234131, 'learning_rate': 7.388214547069086e-06} [Rank 2] Trainer log: {'loss': 0.9444, 'grad_norm': 2.585488796234131, 'learning_rate': 7.388214547069086e-06} {'loss': 0.9444, 'grad_norm': 2.585488796234131, 'learning_rate': 7.388214547069086e-06, 'epoch': 0.6} [Rank 2] Trainer log: {'loss': 0.7711, 'grad_norm': 5.899515628814697, 'learning_rate': 7.38163267258331e-06}[Rank 1] Trainer log: {'loss': 0.7711, 'grad_norm': 5.899515628814697, 'learning_rate': 7.38163267258331e-06} [Rank 3] Trainer log: {'loss': 0.7711, 'grad_norm': 5.899515628814697, 'learning_rate': 7.38163267258331e-06} [Rank 0] Trainer log: {'loss': 0.7711, 'grad_norm': 5.899515628814697, 'learning_rate': 7.38163267258331e-06} {'loss': 0.7711, 'grad_norm': 5.899515628814697, 'learning_rate': 7.38163267258331e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.9491, 'grad_norm': 4.906843185424805, 'learning_rate': 7.3750520156673646e-06}[Rank 2] Trainer log: {'loss': 0.9491, 'grad_norm': 4.906843185424805, 'learning_rate': 7.3750520156673646e-06} [Rank 0] Trainer log: {'loss': 0.9491, 'grad_norm': 4.906843185424805, 'learning_rate': 7.3750520156673646e-06}[Rank 1] Trainer log: {'loss': 0.9491, 'grad_norm': 4.906843185424805, 'learning_rate': 7.3750520156673646e-06} {'loss': 0.9491, 'grad_norm': 4.906843185424805, 'learning_rate': 7.3750520156673646e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.9863, 'grad_norm': 3.3703064918518066, 'learning_rate': 7.368472579381319e-06}[Rank 0] Trainer log: {'loss': 0.9863, 'grad_norm': 3.3703064918518066, 'learning_rate': 7.368472579381319e-06} [Rank 1] Trainer log: {'loss': 0.9863, 'grad_norm': 3.3703064918518066, 'learning_rate': 7.368472579381319e-06} [Rank 2] Trainer log: {'loss': 0.9863, 'grad_norm': 3.3703064918518066, 'learning_rate': 7.368472579381319e-06} {'loss': 0.9863, 'grad_norm': 3.3703064918518066, 'learning_rate': 7.368472579381319e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.8371, 'grad_norm': 5.210656642913818, 'learning_rate': 7.3618943667846954e-06}[Rank 3] Trainer log: {'loss': 0.8371, 'grad_norm': 5.210656642913818, 'learning_rate': 7.3618943667846954e-06}[Rank 2] Trainer log: {'loss': 0.8371, 'grad_norm': 5.210656642913818, 'learning_rate': 7.3618943667846954e-06} [Rank 0] Trainer log: {'loss': 0.8371, 'grad_norm': 5.210656642913818, 'learning_rate': 7.3618943667846954e-06} {'loss': 0.8371, 'grad_norm': 5.210656642913818, 'learning_rate': 7.3618943667846954e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7705, 'grad_norm': 5.886256694793701, 'learning_rate': 7.355317380936427e-06} [Rank 1] Trainer log: {'loss': 0.7705, 'grad_norm': 5.886256694793701, 'learning_rate': 7.355317380936427e-06}[Rank 3] Trainer log: {'loss': 0.7705, 'grad_norm': 5.886256694793701, 'learning_rate': 7.355317380936427e-06} [Rank 0] Trainer log: {'loss': 0.7705, 'grad_norm': 5.886256694793701, 'learning_rate': 7.355317380936427e-06} {'loss': 0.7705, 'grad_norm': 5.886256694793701, 'learning_rate': 7.355317380936427e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.9927, 'grad_norm': 6.430995941162109, 'learning_rate': 7.348741624894886e-06}[Rank 2] Trainer log: {'loss': 0.9927, 'grad_norm': 6.430995941162109, 'learning_rate': 7.348741624894886e-06} [Rank 3] Trainer log: {'loss': 0.9927, 'grad_norm': 6.430995941162109, 'learning_rate': 7.348741624894886e-06}[Rank 0] Trainer log: {'loss': 0.9927, 'grad_norm': 6.430995941162109, 'learning_rate': 7.348741624894886e-06} {'loss': 0.9927, 'grad_norm': 6.430995941162109, 'learning_rate': 7.348741624894886e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.8036, 'grad_norm': 2.6949384212493896, 'learning_rate': 7.342167101717876e-06}[Rank 2] Trainer log: {'loss': 0.8036, 'grad_norm': 2.6949384212493896, 'learning_rate': 7.342167101717876e-06} [Rank 1] Trainer log: {'loss': 0.8036, 'grad_norm': 2.6949384212493896, 'learning_rate': 7.342167101717876e-06}[Rank 3] Trainer log: {'loss': 0.8036, 'grad_norm': 2.6949384212493896, 'learning_rate': 7.342167101717876e-06} {'loss': 0.8036, 'grad_norm': 2.6949384212493896, 'learning_rate': 7.342167101717876e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.7293, 'grad_norm': 4.636180877685547, 'learning_rate': 7.3355938144626235e-06}[Rank 2] Trainer log: {'loss': 0.7293, 'grad_norm': 4.636180877685547, 'learning_rate': 7.3355938144626235e-06}[Rank 1] Trainer log: {'loss': 0.7293, 'grad_norm': 4.636180877685547, 'learning_rate': 7.3355938144626235e-06} [Rank 0] Trainer log: {'loss': 0.7293, 'grad_norm': 4.636180877685547, 'learning_rate': 7.3355938144626235e-06} {'loss': 0.7293, 'grad_norm': 4.636180877685547, 'learning_rate': 7.3355938144626235e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.6979, 'grad_norm': 3.0416252613067627, 'learning_rate': 7.329021766185772e-06}[Rank 2] Trainer log: {'loss': 0.6979, 'grad_norm': 3.0416252613067627, 'learning_rate': 7.329021766185772e-06}[Rank 3] Trainer log: {'loss': 0.6979, 'grad_norm': 3.0416252613067627, 'learning_rate': 7.329021766185772e-06} [Rank 0] Trainer log: {'loss': 0.6979, 'grad_norm': 3.0416252613067627, 'learning_rate': 7.329021766185772e-06} {'loss': 0.6979, 'grad_norm': 3.0416252613067627, 'learning_rate': 7.329021766185772e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.5788, 'grad_norm': 4.403919219970703, 'learning_rate': 7.3224509599434036e-06}[Rank 3] Trainer log: {'loss': 0.5788, 'grad_norm': 4.403919219970703, 'learning_rate': 7.3224509599434036e-06}[Rank 0] Trainer log: {'loss': 0.5788, 'grad_norm': 4.403919219970703, 'learning_rate': 7.3224509599434036e-06} [Rank 1] Trainer log: {'loss': 0.5788, 'grad_norm': 4.403919219970703, 'learning_rate': 7.3224509599434036e-06} {'loss': 0.5788, 'grad_norm': 4.403919219970703, 'learning_rate': 7.3224509599434036e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.8944, 'grad_norm': 1.7660356760025024, 'learning_rate': 7.315881398791018e-06}[Rank 1] Trainer log: {'loss': 0.8944, 'grad_norm': 1.7660356760025024, 'learning_rate': 7.315881398791018e-06}[Rank 3] Trainer log: {'loss': 0.8944, 'grad_norm': 1.7660356760025024, 'learning_rate': 7.315881398791018e-06} [Rank 0] Trainer log: {'loss': 0.8944, 'grad_norm': 1.7660356760025024, 'learning_rate': 7.315881398791018e-06} {'loss': 0.8944, 'grad_norm': 1.7660356760025024, 'learning_rate': 7.315881398791018e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.8807, 'grad_norm': 3.6686367988586426, 'learning_rate': 7.3093130857835245e-06}[Rank 0] Trainer log: {'loss': 0.8807, 'grad_norm': 3.6686367988586426, 'learning_rate': 7.3093130857835245e-06}[Rank 3] Trainer log: {'loss': 0.8807, 'grad_norm': 3.6686367988586426, 'learning_rate': 7.3093130857835245e-06} [Rank 2] Trainer log: {'loss': 0.8807, 'grad_norm': 3.6686367988586426, 'learning_rate': 7.3093130857835245e-06} {'loss': 0.8807, 'grad_norm': 3.6686367988586426, 'learning_rate': 7.3093130857835245e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.775, 'grad_norm': 3.4137661457061768, 'learning_rate': 7.302746023975273e-06}[Rank 1] Trainer log: {'loss': 0.775, 'grad_norm': 3.4137661457061768, 'learning_rate': 7.302746023975273e-06}[Rank 3] Trainer log: {'loss': 0.775, 'grad_norm': 3.4137661457061768, 'learning_rate': 7.302746023975273e-06} [Rank 0] Trainer log: {'loss': 0.775, 'grad_norm': 3.4137661457061768, 'learning_rate': 7.302746023975273e-06} {'loss': 0.775, 'grad_norm': 3.4137661457061768, 'learning_rate': 7.302746023975273e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.8969, 'grad_norm': 4.437193393707275, 'learning_rate': 7.296180216420012e-06} [Rank 3] Trainer log: {'loss': 0.8969, 'grad_norm': 4.437193393707275, 'learning_rate': 7.296180216420012e-06} [Rank 0] Trainer log: {'loss': 0.8969, 'grad_norm': 4.437193393707275, 'learning_rate': 7.296180216420012e-06}[Rank 2] Trainer log: {'loss': 0.8969, 'grad_norm': 4.437193393707275, 'learning_rate': 7.296180216420012e-06} {'loss': 0.8969, 'grad_norm': 4.437193393707275, 'learning_rate': 7.296180216420012e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7913, 'grad_norm': 4.22709846496582, 'learning_rate': 7.289615666170915e-06}[Rank 3] Trainer log: {'loss': 0.7913, 'grad_norm': 4.22709846496582, 'learning_rate': 7.289615666170915e-06} [Rank 1] Trainer log: {'loss': 0.7913, 'grad_norm': 4.22709846496582, 'learning_rate': 7.289615666170915e-06} [Rank 0] Trainer log: {'loss': 0.7913, 'grad_norm': 4.22709846496582, 'learning_rate': 7.289615666170915e-06} {'loss': 0.7913, 'grad_norm': 4.22709846496582, 'learning_rate': 7.289615666170915e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.8815, 'grad_norm': 3.790130853652954, 'learning_rate': 7.283052376280577e-06}[Rank 3] Trainer log: {'loss': 0.8815, 'grad_norm': 3.790130853652954, 'learning_rate': 7.283052376280577e-06} [Rank 1] Trainer log: {'loss': 0.8815, 'grad_norm': 3.790130853652954, 'learning_rate': 7.283052376280577e-06} [Rank 0] Trainer log: {'loss': 0.8815, 'grad_norm': 3.790130853652954, 'learning_rate': 7.283052376280577e-06} {'loss': 0.8815, 'grad_norm': 3.790130853652954, 'learning_rate': 7.283052376280577e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.6775, 'grad_norm': 8.832178115844727, 'learning_rate': 7.276490349800996e-06}[Rank 3] Trainer log: {'loss': 0.6775, 'grad_norm': 8.832178115844727, 'learning_rate': 7.276490349800996e-06}[Rank 0] Trainer log: {'loss': 0.6775, 'grad_norm': 8.832178115844727, 'learning_rate': 7.276490349800996e-06} [Rank 2] Trainer log: {'loss': 0.6775, 'grad_norm': 8.832178115844727, 'learning_rate': 7.276490349800996e-06} {'loss': 0.6775, 'grad_norm': 8.832178115844727, 'learning_rate': 7.276490349800996e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.9449, 'grad_norm': 5.03374719619751, 'learning_rate': 7.269929589783584e-06}[Rank 0] Trainer log: {'loss': 0.9449, 'grad_norm': 5.03374719619751, 'learning_rate': 7.269929589783584e-06} [Rank 1] Trainer log: {'loss': 0.9449, 'grad_norm': 5.03374719619751, 'learning_rate': 7.269929589783584e-06}[Rank 2] Trainer log: {'loss': 0.9449, 'grad_norm': 5.03374719619751, 'learning_rate': 7.269929589783584e-06} {'loss': 0.9449, 'grad_norm': 5.03374719619751, 'learning_rate': 7.269929589783584e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.6889, 'grad_norm': 4.095062732696533, 'learning_rate': 7.263370099279173e-06}[Rank 3] Trainer log: {'loss': 0.6889, 'grad_norm': 4.095062732696533, 'learning_rate': 7.263370099279173e-06}[Rank 2] Trainer log: {'loss': 0.6889, 'grad_norm': 4.095062732696533, 'learning_rate': 7.263370099279173e-06} [Rank 1] Trainer log: {'loss': 0.6889, 'grad_norm': 4.095062732696533, 'learning_rate': 7.263370099279173e-06} {'loss': 0.6889, 'grad_norm': 4.095062732696533, 'learning_rate': 7.263370099279173e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.9449, 'grad_norm': 3.926485776901245, 'learning_rate': 7.2568118813379966e-06}[Rank 0] Trainer log: {'loss': 0.9449, 'grad_norm': 3.926485776901245, 'learning_rate': 7.2568118813379966e-06}[Rank 3] Trainer log: {'loss': 0.9449, 'grad_norm': 3.926485776901245, 'learning_rate': 7.2568118813379966e-06} [Rank 2] Trainer log: {'loss': 0.9449, 'grad_norm': 3.926485776901245, 'learning_rate': 7.2568118813379966e-06} {'loss': 0.9449, 'grad_norm': 3.926485776901245, 'learning_rate': 7.2568118813379966e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.964, 'grad_norm': 4.300642967224121, 'learning_rate': 7.250254939009697e-06}[Rank 1] Trainer log: {'loss': 0.964, 'grad_norm': 4.300642967224121, 'learning_rate': 7.250254939009697e-06} [Rank 2] Trainer log: {'loss': 0.964, 'grad_norm': 4.300642967224121, 'learning_rate': 7.250254939009697e-06} [Rank 3] Trainer log: {'loss': 0.964, 'grad_norm': 4.300642967224121, 'learning_rate': 7.250254939009697e-06} {'loss': 0.964, 'grad_norm': 4.300642967224121, 'learning_rate': 7.250254939009697e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.9625, 'grad_norm': 3.7080740928649902, 'learning_rate': 7.243699275343327e-06} [Rank 3] Trainer log: {'loss': 0.9625, 'grad_norm': 3.7080740928649902, 'learning_rate': 7.243699275343327e-06}[Rank 0] Trainer log: {'loss': 0.9625, 'grad_norm': 3.7080740928649902, 'learning_rate': 7.243699275343327e-06} [Rank 1] Trainer log: {'loss': 0.9625, 'grad_norm': 3.7080740928649902, 'learning_rate': 7.243699275343327e-06} {'loss': 0.9625, 'grad_norm': 3.7080740928649902, 'learning_rate': 7.243699275343327e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.8684, 'grad_norm': 5.22523832321167, 'learning_rate': 7.237144893387342e-06}[Rank 3] Trainer log: {'loss': 0.8684, 'grad_norm': 5.22523832321167, 'learning_rate': 7.237144893387342e-06}[Rank 1] Trainer log: {'loss': 0.8684, 'grad_norm': 5.22523832321167, 'learning_rate': 7.237144893387342e-06} [Rank 0] Trainer log: {'loss': 0.8684, 'grad_norm': 5.22523832321167, 'learning_rate': 7.237144893387342e-06} {'loss': 0.8684, 'grad_norm': 5.22523832321167, 'learning_rate': 7.237144893387342e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.5878, 'grad_norm': 9.511547088623047, 'learning_rate': 7.230591796189605e-06}[Rank 3] Trainer log: {'loss': 0.5878, 'grad_norm': 9.511547088623047, 'learning_rate': 7.230591796189605e-06}[Rank 0] Trainer log: {'loss': 0.5878, 'grad_norm': 9.511547088623047, 'learning_rate': 7.230591796189605e-06} [Rank 2] Trainer log: {'loss': 0.5878, 'grad_norm': 9.511547088623047, 'learning_rate': 7.230591796189605e-06} {'loss': 0.5878, 'grad_norm': 9.511547088623047, 'learning_rate': 7.230591796189605e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 1.0487, 'grad_norm': 4.135075092315674, 'learning_rate': 7.224039986797373e-06} [Rank 3] Trainer log: {'loss': 1.0487, 'grad_norm': 4.135075092315674, 'learning_rate': 7.224039986797373e-06} [Rank 1] Trainer log: {'loss': 1.0487, 'grad_norm': 4.135075092315674, 'learning_rate': 7.224039986797373e-06} [Rank 0] Trainer log: {'loss': 1.0487, 'grad_norm': 4.135075092315674, 'learning_rate': 7.224039986797373e-06} {'loss': 1.0487, 'grad_norm': 4.135075092315674, 'learning_rate': 7.224039986797373e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.8258, 'grad_norm': 3.0454306602478027, 'learning_rate': 7.2174894682573145e-06}[Rank 1] Trainer log: {'loss': 0.8258, 'grad_norm': 3.0454306602478027, 'learning_rate': 7.2174894682573145e-06}[Rank 3] Trainer log: {'loss': 0.8258, 'grad_norm': 3.0454306602478027, 'learning_rate': 7.2174894682573145e-06} [Rank 0] Trainer log: {'loss': 0.8258, 'grad_norm': 3.0454306602478027, 'learning_rate': 7.2174894682573145e-06} {'loss': 0.8258, 'grad_norm': 3.0454306602478027, 'learning_rate': 7.2174894682573145e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7242, 'grad_norm': 9.588912963867188, 'learning_rate': 7.210940243615495e-06}[Rank 1] Trainer log: {'loss': 0.7242, 'grad_norm': 9.588912963867188, 'learning_rate': 7.210940243615495e-06}[Rank 3] Trainer log: {'loss': 0.7242, 'grad_norm': 9.588912963867188, 'learning_rate': 7.210940243615495e-06} [Rank 0] Trainer log: {'loss': 0.7242, 'grad_norm': 9.588912963867188, 'learning_rate': 7.210940243615495e-06} {'loss': 0.7242, 'grad_norm': 9.588912963867188, 'learning_rate': 7.210940243615495e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7272, 'grad_norm': 4.758750915527344, 'learning_rate': 7.20439231591737e-06} [Rank 1] Trainer log: {'loss': 0.7272, 'grad_norm': 4.758750915527344, 'learning_rate': 7.20439231591737e-06}[Rank 3] Trainer log: {'loss': 0.7272, 'grad_norm': 4.758750915527344, 'learning_rate': 7.20439231591737e-06} [Rank 0] Trainer log: {'loss': 0.7272, 'grad_norm': 4.758750915527344, 'learning_rate': 7.20439231591737e-06} {'loss': 0.7272, 'grad_norm': 4.758750915527344, 'learning_rate': 7.20439231591737e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.6962, 'grad_norm': 9.748427391052246, 'learning_rate': 7.197845688207805e-06}[Rank 0] Trainer log: {'loss': 0.6962, 'grad_norm': 9.748427391052246, 'learning_rate': 7.197845688207805e-06}[Rank 1] Trainer log: {'loss': 0.6962, 'grad_norm': 9.748427391052246, 'learning_rate': 7.197845688207805e-06} [Rank 3] Trainer log: {'loss': 0.6962, 'grad_norm': 9.748427391052246, 'learning_rate': 7.197845688207805e-06} {'loss': 0.6962, 'grad_norm': 9.748427391052246, 'learning_rate': 7.197845688207805e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.85, 'grad_norm': 4.544228553771973, 'learning_rate': 7.1913003635310505e-06}[Rank 2] Trainer log: {'loss': 0.85, 'grad_norm': 4.544228553771973, 'learning_rate': 7.1913003635310505e-06} [Rank 3] Trainer log: {'loss': 0.85, 'grad_norm': 4.544228553771973, 'learning_rate': 7.1913003635310505e-06} [Rank 1] Trainer log: {'loss': 0.85, 'grad_norm': 4.544228553771973, 'learning_rate': 7.1913003635310505e-06} {'loss': 0.85, 'grad_norm': 4.544228553771973, 'learning_rate': 7.1913003635310505e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.8997, 'grad_norm': 7.64717960357666, 'learning_rate': 7.184756344930755e-06}[Rank 2] Trainer log: {'loss': 0.8997, 'grad_norm': 7.64717960357666, 'learning_rate': 7.184756344930755e-06} [Rank 1] Trainer log: {'loss': 0.8997, 'grad_norm': 7.64717960357666, 'learning_rate': 7.184756344930755e-06}[Rank 3] Trainer log: {'loss': 0.8997, 'grad_norm': 7.64717960357666, 'learning_rate': 7.184756344930755e-06} {'loss': 0.8997, 'grad_norm': 7.64717960357666, 'learning_rate': 7.184756344930755e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 1.0506, 'grad_norm': 2.2732629776000977, 'learning_rate': 7.178213635449963e-06} [Rank 0] Trainer log: {'loss': 1.0506, 'grad_norm': 2.2732629776000977, 'learning_rate': 7.178213635449963e-06}[Rank 2] Trainer log: {'loss': 1.0506, 'grad_norm': 2.2732629776000977, 'learning_rate': 7.178213635449963e-06}[Rank 1] Trainer log: {'loss': 1.0506, 'grad_norm': 2.2732629776000977, 'learning_rate': 7.178213635449963e-06} {'loss': 1.0506, 'grad_norm': 2.2732629776000977, 'learning_rate': 7.178213635449963e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.8612, 'grad_norm': 3.907376766204834, 'learning_rate': 7.171672238131107e-06} [Rank 3] Trainer log: {'loss': 0.8612, 'grad_norm': 3.907376766204834, 'learning_rate': 7.171672238131107e-06} [Rank 1] Trainer log: {'loss': 0.8612, 'grad_norm': 3.907376766204834, 'learning_rate': 7.171672238131107e-06}[Rank 0] Trainer log: {'loss': 0.8612, 'grad_norm': 3.907376766204834, 'learning_rate': 7.171672238131107e-06} {'loss': 0.8612, 'grad_norm': 3.907376766204834, 'learning_rate': 7.171672238131107e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.624, 'grad_norm': 8.171183586120605, 'learning_rate': 7.165132156016002e-06}[Rank 1] Trainer log: {'loss': 0.624, 'grad_norm': 8.171183586120605, 'learning_rate': 7.165132156016002e-06} [Rank 0] Trainer log: {'loss': 0.624, 'grad_norm': 8.171183586120605, 'learning_rate': 7.165132156016002e-06}[Rank 3] Trainer log: {'loss': 0.624, 'grad_norm': 8.171183586120605, 'learning_rate': 7.165132156016002e-06} {'loss': 0.624, 'grad_norm': 8.171183586120605, 'learning_rate': 7.165132156016002e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.7779, 'grad_norm': 11.084290504455566, 'learning_rate': 7.158593392145865e-06}[Rank 3] Trainer log: {'loss': 0.7779, 'grad_norm': 11.084290504455566, 'learning_rate': 7.158593392145865e-06}[Rank 1] Trainer log: {'loss': 0.7779, 'grad_norm': 11.084290504455566, 'learning_rate': 7.158593392145865e-06} [Rank 2] Trainer log: {'loss': 0.7779, 'grad_norm': 11.084290504455566, 'learning_rate': 7.158593392145865e-06} {'loss': 0.7779, 'grad_norm': 11.084290504455566, 'learning_rate': 7.158593392145865e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.9557, 'grad_norm': 4.801253318786621, 'learning_rate': 7.152055949561294e-06}[Rank 1] Trainer log: {'loss': 0.9557, 'grad_norm': 4.801253318786621, 'learning_rate': 7.152055949561294e-06}[Rank 2] Trainer log: {'loss': 0.9557, 'grad_norm': 4.801253318786621, 'learning_rate': 7.152055949561294e-06} [Rank 3] Trainer log: {'loss': 0.9557, 'grad_norm': 4.801253318786621, 'learning_rate': 7.152055949561294e-06} {'loss': 0.9557, 'grad_norm': 4.801253318786621, 'learning_rate': 7.152055949561294e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.8259, 'grad_norm': 9.08299732208252, 'learning_rate': 7.145519831302268e-06}[Rank 2] Trainer log: {'loss': 0.8259, 'grad_norm': 9.08299732208252, 'learning_rate': 7.145519831302268e-06} [Rank 1] Trainer log: {'loss': 0.8259, 'grad_norm': 9.08299732208252, 'learning_rate': 7.145519831302268e-06} [Rank 3] Trainer log: {'loss': 0.8259, 'grad_norm': 9.08299732208252, 'learning_rate': 7.145519831302268e-06} {'loss': 0.8259, 'grad_norm': 9.08299732208252, 'learning_rate': 7.145519831302268e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.755, 'grad_norm': 7.008230209350586, 'learning_rate': 7.1389850404081595e-06}[Rank 2] Trainer log: {'loss': 0.755, 'grad_norm': 7.008230209350586, 'learning_rate': 7.1389850404081595e-06}[Rank 3] Trainer log: {'loss': 0.755, 'grad_norm': 7.008230209350586, 'learning_rate': 7.1389850404081595e-06} [Rank 0] Trainer log: {'loss': 0.755, 'grad_norm': 7.008230209350586, 'learning_rate': 7.1389850404081595e-06} {'loss': 0.755, 'grad_norm': 7.008230209350586, 'learning_rate': 7.1389850404081595e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.8631, 'grad_norm': 6.800971508026123, 'learning_rate': 7.132451579917717e-06}[Rank 1] Trainer log: {'loss': 0.8631, 'grad_norm': 6.800971508026123, 'learning_rate': 7.132451579917717e-06}[Rank 2] Trainer log: {'loss': 0.8631, 'grad_norm': 6.800971508026123, 'learning_rate': 7.132451579917717e-06} [Rank 0] Trainer log: {'loss': 0.8631, 'grad_norm': 6.800971508026123, 'learning_rate': 7.132451579917717e-06} {'loss': 0.8631, 'grad_norm': 6.800971508026123, 'learning_rate': 7.132451579917717e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7451, 'grad_norm': 5.662895679473877, 'learning_rate': 7.1259194528690665e-06}[Rank 1] Trainer log: {'loss': 0.7451, 'grad_norm': 5.662895679473877, 'learning_rate': 7.1259194528690665e-06} [Rank 3] Trainer log: {'loss': 0.7451, 'grad_norm': 5.662895679473877, 'learning_rate': 7.1259194528690665e-06} [Rank 0] Trainer log: {'loss': 0.7451, 'grad_norm': 5.662895679473877, 'learning_rate': 7.1259194528690665e-06} {'loss': 0.7451, 'grad_norm': 5.662895679473877, 'learning_rate': 7.1259194528690665e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7084, 'grad_norm': 3.645235061645508, 'learning_rate': 7.119388662299729e-06}[Rank 3] Trainer log: {'loss': 0.7084, 'grad_norm': 3.645235061645508, 'learning_rate': 7.119388662299729e-06}[Rank 1] Trainer log: {'loss': 0.7084, 'grad_norm': 3.645235061645508, 'learning_rate': 7.119388662299729e-06} [Rank 0] Trainer log: {'loss': 0.7084, 'grad_norm': 3.645235061645508, 'learning_rate': 7.119388662299729e-06} {'loss': 0.7084, 'grad_norm': 3.645235061645508, 'learning_rate': 7.119388662299729e-06, 'epoch': 0.61} [Rank 3] Trainer log: {'loss': 0.8191, 'grad_norm': 4.453382968902588, 'learning_rate': 7.112859211246591e-06} [Rank 1] Trainer log: {'loss': 0.8191, 'grad_norm': 4.453382968902588, 'learning_rate': 7.112859211246591e-06} [Rank 0] Trainer log: {'loss': 0.8191, 'grad_norm': 4.453382968902588, 'learning_rate': 7.112859211246591e-06}[Rank 2] Trainer log: {'loss': 0.8191, 'grad_norm': 4.453382968902588, 'learning_rate': 7.112859211246591e-06} {'loss': 0.8191, 'grad_norm': 4.453382968902588, 'learning_rate': 7.112859211246591e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 0.8176, 'grad_norm': 6.46124792098999, 'learning_rate': 7.106331102745914e-06}[Rank 3] Trainer log: {'loss': 0.8176, 'grad_norm': 6.46124792098999, 'learning_rate': 7.106331102745914e-06} [Rank 1] Trainer log: {'loss': 0.8176, 'grad_norm': 6.46124792098999, 'learning_rate': 7.106331102745914e-06} [Rank 2] Trainer log: {'loss': 0.8176, 'grad_norm': 6.46124792098999, 'learning_rate': 7.106331102745914e-06} {'loss': 0.8176, 'grad_norm': 6.46124792098999, 'learning_rate': 7.106331102745914e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.5739, 'grad_norm': 12.511343955993652, 'learning_rate': 7.099804339833346e-06}[Rank 3] Trainer log: {'loss': 0.5739, 'grad_norm': 12.511343955993652, 'learning_rate': 7.099804339833346e-06} [Rank 0] Trainer log: {'loss': 0.5739, 'grad_norm': 12.511343955993652, 'learning_rate': 7.099804339833346e-06}[Rank 2] Trainer log: {'loss': 0.5739, 'grad_norm': 12.511343955993652, 'learning_rate': 7.099804339833346e-06} {'loss': 0.5739, 'grad_norm': 12.511343955993652, 'learning_rate': 7.099804339833346e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.973, 'grad_norm': 4.5996904373168945, 'learning_rate': 7.093278925543906e-06}[Rank 0] Trainer log: {'loss': 0.973, 'grad_norm': 4.5996904373168945, 'learning_rate': 7.093278925543906e-06} [Rank 2] Trainer log: {'loss': 0.973, 'grad_norm': 4.5996904373168945, 'learning_rate': 7.093278925543906e-06}[Rank 3] Trainer log: {'loss': 0.973, 'grad_norm': 4.5996904373168945, 'learning_rate': 7.093278925543906e-06} {'loss': 0.973, 'grad_norm': 4.5996904373168945, 'learning_rate': 7.093278925543906e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.9295, 'grad_norm': 3.054058313369751, 'learning_rate': 7.086754862911982e-06}[Rank 3] Trainer log: {'loss': 0.9295, 'grad_norm': 3.054058313369751, 'learning_rate': 7.086754862911982e-06} [Rank 2] Trainer log: {'loss': 0.9295, 'grad_norm': 3.054058313369751, 'learning_rate': 7.086754862911982e-06}[Rank 0] Trainer log: {'loss': 0.9295, 'grad_norm': 3.054058313369751, 'learning_rate': 7.086754862911982e-06} {'loss': 0.9295, 'grad_norm': 3.054058313369751, 'learning_rate': 7.086754862911982e-06, 'epoch': 0.61} [Rank 0] Trainer log: {'loss': 1.0748, 'grad_norm': 4.779277324676514, 'learning_rate': 7.080232154971329e-06}[Rank 1] Trainer log: {'loss': 1.0748, 'grad_norm': 4.779277324676514, 'learning_rate': 7.080232154971329e-06}[Rank 3] Trainer log: {'loss': 1.0748, 'grad_norm': 4.779277324676514, 'learning_rate': 7.080232154971329e-06} [Rank 2] Trainer log: {'loss': 1.0748, 'grad_norm': 4.779277324676514, 'learning_rate': 7.080232154971329e-06} {'loss': 1.0748, 'grad_norm': 4.779277324676514, 'learning_rate': 7.080232154971329e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.8743, 'grad_norm': 2.3404481410980225, 'learning_rate': 7.073710804755087e-06}[Rank 0] Trainer log: {'loss': 0.8743, 'grad_norm': 2.3404481410980225, 'learning_rate': 7.073710804755087e-06}[Rank 3] Trainer log: {'loss': 0.8743, 'grad_norm': 2.3404481410980225, 'learning_rate': 7.073710804755087e-06} [Rank 2] Trainer log: {'loss': 0.8743, 'grad_norm': 2.3404481410980225, 'learning_rate': 7.073710804755087e-06} {'loss': 0.8743, 'grad_norm': 2.3404481410980225, 'learning_rate': 7.073710804755087e-06, 'epoch': 0.61} [Rank 2] Trainer log: {'loss': 0.7443, 'grad_norm': 2.173382520675659, 'learning_rate': 7.067190815295754e-06}[Rank 1] Trainer log: {'loss': 0.7443, 'grad_norm': 2.173382520675659, 'learning_rate': 7.067190815295754e-06}[Rank 3] Trainer log: {'loss': 0.7443, 'grad_norm': 2.173382520675659, 'learning_rate': 7.067190815295754e-06} [Rank 0] Trainer log: {'loss': 0.7443, 'grad_norm': 2.173382520675659, 'learning_rate': 7.067190815295754e-06} {'loss': 0.7443, 'grad_norm': 2.173382520675659, 'learning_rate': 7.067190815295754e-06, 'epoch': 0.61} [Rank 1] Trainer log: {'loss': 0.8245, 'grad_norm': 10.832181930541992, 'learning_rate': 7.060672189625195e-06} [Rank 3] Trainer log: {'loss': 0.8245, 'grad_norm': 10.832181930541992, 'learning_rate': 7.060672189625195e-06}[Rank 0] Trainer log: {'loss': 0.8245, 'grad_norm': 10.832181930541992, 'learning_rate': 7.060672189625195e-06}[Rank 2] Trainer log: {'loss': 0.8245, 'grad_norm': 10.832181930541992, 'learning_rate': 7.060672189625195e-06} {'loss': 0.8245, 'grad_norm': 10.832181930541992, 'learning_rate': 7.060672189625195e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.7865, 'grad_norm': 3.9552204608917236, 'learning_rate': 7.054154930774648e-06}[Rank 3] Trainer log: {'loss': 0.7865, 'grad_norm': 3.9552204608917236, 'learning_rate': 7.054154930774648e-06} [Rank 2] Trainer log: {'loss': 0.7865, 'grad_norm': 3.9552204608917236, 'learning_rate': 7.054154930774648e-06} [Rank 0] Trainer log: {'loss': 0.7865, 'grad_norm': 3.9552204608917236, 'learning_rate': 7.054154930774648e-06} {'loss': 0.7865, 'grad_norm': 3.9552204608917236, 'learning_rate': 7.054154930774648e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.9748, 'grad_norm': 12.856810569763184, 'learning_rate': 7.047639041774705e-06}[Rank 0] Trainer log: {'loss': 0.9748, 'grad_norm': 12.856810569763184, 'learning_rate': 7.047639041774705e-06}[Rank 1] Trainer log: {'loss': 0.9748, 'grad_norm': 12.856810569763184, 'learning_rate': 7.047639041774705e-06} [Rank 3] Trainer log: {'loss': 0.9748, 'grad_norm': 12.856810569763184, 'learning_rate': 7.047639041774705e-06} {'loss': 0.9748, 'grad_norm': 12.856810569763184, 'learning_rate': 7.047639041774705e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.7858, 'grad_norm': 2.0539162158966064, 'learning_rate': 7.0411245256553275e-06}[Rank 3] Trainer log: {'loss': 0.7858, 'grad_norm': 2.0539162158966064, 'learning_rate': 7.0411245256553275e-06} [Rank 1] Trainer log: {'loss': 0.7858, 'grad_norm': 2.0539162158966064, 'learning_rate': 7.0411245256553275e-06} [Rank 0] Trainer log: {'loss': 0.7858, 'grad_norm': 2.0539162158966064, 'learning_rate': 7.0411245256553275e-06} {'loss': 0.7858, 'grad_norm': 2.0539162158966064, 'learning_rate': 7.0411245256553275e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.7428, 'grad_norm': 2.1773934364318848, 'learning_rate': 7.034611385445842e-06}[Rank 2] Trainer log: {'loss': 0.7428, 'grad_norm': 2.1773934364318848, 'learning_rate': 7.034611385445842e-06} [Rank 1] Trainer log: {'loss': 0.7428, 'grad_norm': 2.1773934364318848, 'learning_rate': 7.034611385445842e-06} [Rank 0] Trainer log: {'loss': 0.7428, 'grad_norm': 2.1773934364318848, 'learning_rate': 7.034611385445842e-06} {'loss': 0.7428, 'grad_norm': 2.1773934364318848, 'learning_rate': 7.034611385445842e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.8294, 'grad_norm': 7.068175792694092, 'learning_rate': 7.028099624174929e-06}[Rank 0] Trainer log: {'loss': 0.8294, 'grad_norm': 7.068175792694092, 'learning_rate': 7.028099624174929e-06}[Rank 3] Trainer log: {'loss': 0.8294, 'grad_norm': 7.068175792694092, 'learning_rate': 7.028099624174929e-06} [Rank 1] Trainer log: {'loss': 0.8294, 'grad_norm': 7.068175792694092, 'learning_rate': 7.028099624174929e-06} {'loss': 0.8294, 'grad_norm': 7.068175792694092, 'learning_rate': 7.028099624174929e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.7405, 'grad_norm': 2.61645770072937, 'learning_rate': 7.0215892448706255e-06}[Rank 2] Trainer log: {'loss': 0.7405, 'grad_norm': 2.61645770072937, 'learning_rate': 7.0215892448706255e-06}[Rank 1] Trainer log: {'loss': 0.7405, 'grad_norm': 2.61645770072937, 'learning_rate': 7.0215892448706255e-06} [Rank 0] Trainer log: {'loss': 0.7405, 'grad_norm': 2.61645770072937, 'learning_rate': 7.0215892448706255e-06} {'loss': 0.7405, 'grad_norm': 2.61645770072937, 'learning_rate': 7.0215892448706255e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8724, 'grad_norm': 8.022522926330566, 'learning_rate': 7.015080250560334e-06} [Rank 2] Trainer log: {'loss': 0.8724, 'grad_norm': 8.022522926330566, 'learning_rate': 7.015080250560334e-06} [Rank 1] Trainer log: {'loss': 0.8724, 'grad_norm': 8.022522926330566, 'learning_rate': 7.015080250560334e-06} [Rank 0] Trainer log: {'loss': 0.8724, 'grad_norm': 8.022522926330566, 'learning_rate': 7.015080250560334e-06} {'loss': 0.8724, 'grad_norm': 8.022522926330566, 'learning_rate': 7.015080250560334e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.941, 'grad_norm': 2.197694778442383, 'learning_rate': 7.00857264427081e-06}[Rank 3] Trainer log: {'loss': 0.941, 'grad_norm': 2.197694778442383, 'learning_rate': 7.00857264427081e-06} [Rank 1] Trainer log: {'loss': 0.941, 'grad_norm': 2.197694778442383, 'learning_rate': 7.00857264427081e-06} [Rank 0] Trainer log: {'loss': 0.941, 'grad_norm': 2.197694778442383, 'learning_rate': 7.00857264427081e-06} {'loss': 0.941, 'grad_norm': 2.197694778442383, 'learning_rate': 7.00857264427081e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8968, 'grad_norm': 4.1226911544799805, 'learning_rate': 7.002066429028158e-06}[Rank 1] Trainer log: {'loss': 0.8968, 'grad_norm': 4.1226911544799805, 'learning_rate': 7.002066429028158e-06}[Rank 2] Trainer log: {'loss': 0.8968, 'grad_norm': 4.1226911544799805, 'learning_rate': 7.002066429028158e-06} [Rank 0] Trainer log: {'loss': 0.8968, 'grad_norm': 4.1226911544799805, 'learning_rate': 7.002066429028158e-06} {'loss': 0.8968, 'grad_norm': 4.1226911544799805, 'learning_rate': 7.002066429028158e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.9747, 'grad_norm': 2.9670250415802, 'learning_rate': 6.995561607857847e-06}[Rank 3] Trainer log: {'loss': 0.9747, 'grad_norm': 2.9670250415802, 'learning_rate': 6.995561607857847e-06}[Rank 1] Trainer log: {'loss': 0.9747, 'grad_norm': 2.9670250415802, 'learning_rate': 6.995561607857847e-06} [Rank 0] Trainer log: {'loss': 0.9747, 'grad_norm': 2.9670250415802, 'learning_rate': 6.995561607857847e-06} {'loss': 0.9747, 'grad_norm': 2.9670250415802, 'learning_rate': 6.995561607857847e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.8489, 'grad_norm': 7.509688854217529, 'learning_rate': 6.989058183784685e-06} [Rank 2] Trainer log: {'loss': 0.8489, 'grad_norm': 7.509688854217529, 'learning_rate': 6.989058183784685e-06} [Rank 0] Trainer log: {'loss': 0.8489, 'grad_norm': 7.509688854217529, 'learning_rate': 6.989058183784685e-06}[Rank 3] Trainer log: {'loss': 0.8489, 'grad_norm': 7.509688854217529, 'learning_rate': 6.989058183784685e-06} {'loss': 0.8489, 'grad_norm': 7.509688854217529, 'learning_rate': 6.989058183784685e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.5975, 'grad_norm': 5.862546443939209, 'learning_rate': 6.9825561598328364e-06}[Rank 2] Trainer log: {'loss': 0.5975, 'grad_norm': 5.862546443939209, 'learning_rate': 6.9825561598328364e-06}[Rank 3] Trainer log: {'loss': 0.5975, 'grad_norm': 5.862546443939209, 'learning_rate': 6.9825561598328364e-06} [Rank 0] Trainer log: {'loss': 0.5975, 'grad_norm': 5.862546443939209, 'learning_rate': 6.9825561598328364e-06} {'loss': 0.5975, 'grad_norm': 5.862546443939209, 'learning_rate': 6.9825561598328364e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.9722, 'grad_norm': 1.9924631118774414, 'learning_rate': 6.976055539025819e-06}[Rank 2] Trainer log: {'loss': 0.9722, 'grad_norm': 1.9924631118774414, 'learning_rate': 6.976055539025819e-06}[Rank 3] Trainer log: {'loss': 0.9722, 'grad_norm': 1.9924631118774414, 'learning_rate': 6.976055539025819e-06} [Rank 0] Trainer log: {'loss': 0.9722, 'grad_norm': 1.9924631118774414, 'learning_rate': 6.976055539025819e-06} {'loss': 0.9722, 'grad_norm': 1.9924631118774414, 'learning_rate': 6.976055539025819e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 0.8671, 'grad_norm': 6.281204700469971, 'learning_rate': 6.9695563243864915e-06}[Rank 2] Trainer log: {'loss': 0.8671, 'grad_norm': 6.281204700469971, 'learning_rate': 6.9695563243864915e-06}[Rank 1] Trainer log: {'loss': 0.8671, 'grad_norm': 6.281204700469971, 'learning_rate': 6.9695563243864915e-06} [Rank 3] Trainer log: {'loss': 0.8671, 'grad_norm': 6.281204700469971, 'learning_rate': 6.9695563243864915e-06} {'loss': 0.8671, 'grad_norm': 6.281204700469971, 'learning_rate': 6.9695563243864915e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.7572, 'grad_norm': 12.653520584106445, 'learning_rate': 6.9630585189370556e-06}[Rank 1] Trainer log: {'loss': 0.7572, 'grad_norm': 12.653520584106445, 'learning_rate': 6.9630585189370556e-06}[Rank 0] Trainer log: {'loss': 0.7572, 'grad_norm': 12.653520584106445, 'learning_rate': 6.9630585189370556e-06} [Rank 2] Trainer log: {'loss': 0.7572, 'grad_norm': 12.653520584106445, 'learning_rate': 6.9630585189370556e-06} {'loss': 0.7572, 'grad_norm': 12.653520584106445, 'learning_rate': 6.9630585189370556e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.7352, 'grad_norm': 2.6997971534729004, 'learning_rate': 6.95656212569907e-06}[Rank 0] Trainer log: {'loss': 0.7352, 'grad_norm': 2.6997971534729004, 'learning_rate': 6.95656212569907e-06} [Rank 1] Trainer log: {'loss': 0.7352, 'grad_norm': 2.6997971534729004, 'learning_rate': 6.95656212569907e-06} [Rank 3] Trainer log: {'loss': 0.7352, 'grad_norm': 2.6997971534729004, 'learning_rate': 6.95656212569907e-06} {'loss': 0.7352, 'grad_norm': 2.6997971534729004, 'learning_rate': 6.95656212569907e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.8087, 'grad_norm': 4.146168231964111, 'learning_rate': 6.950067147693429e-06} [Rank 2] Trainer log: {'loss': 0.8087, 'grad_norm': 4.146168231964111, 'learning_rate': 6.950067147693429e-06}[Rank 0] Trainer log: {'loss': 0.8087, 'grad_norm': 4.146168231964111, 'learning_rate': 6.950067147693429e-06}[Rank 3] Trainer log: {'loss': 0.8087, 'grad_norm': 4.146168231964111, 'learning_rate': 6.950067147693429e-06} {'loss': 0.8087, 'grad_norm': 4.146168231964111, 'learning_rate': 6.950067147693429e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.9365, 'grad_norm': 8.402847290039062, 'learning_rate': 6.943573587940362e-06}[Rank 3] Trainer log: {'loss': 0.9365, 'grad_norm': 8.402847290039062, 'learning_rate': 6.943573587940362e-06}[Rank 0] Trainer log: {'loss': 0.9365, 'grad_norm': 8.402847290039062, 'learning_rate': 6.943573587940362e-06} [Rank 2] Trainer log: {'loss': 0.9365, 'grad_norm': 8.402847290039062, 'learning_rate': 6.943573587940362e-06} {'loss': 0.9365, 'grad_norm': 8.402847290039062, 'learning_rate': 6.943573587940362e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8134, 'grad_norm': 2.189253330230713, 'learning_rate': 6.937081449459456e-06}[Rank 1] Trainer log: {'loss': 0.8134, 'grad_norm': 2.189253330230713, 'learning_rate': 6.937081449459456e-06}[Rank 0] Trainer log: {'loss': 0.8134, 'grad_norm': 2.189253330230713, 'learning_rate': 6.937081449459456e-06} [Rank 2] Trainer log: {'loss': 0.8134, 'grad_norm': 2.189253330230713, 'learning_rate': 6.937081449459456e-06} {'loss': 0.8134, 'grad_norm': 2.189253330230713, 'learning_rate': 6.937081449459456e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.4977, 'grad_norm': 4.5217156410217285, 'learning_rate': 6.930590735269621e-06}[Rank 1] Trainer log: {'loss': 0.4977, 'grad_norm': 4.5217156410217285, 'learning_rate': 6.930590735269621e-06}[Rank 3] Trainer log: {'loss': 0.4977, 'grad_norm': 4.5217156410217285, 'learning_rate': 6.930590735269621e-06} [Rank 0] Trainer log: {'loss': 0.4977, 'grad_norm': 4.5217156410217285, 'learning_rate': 6.930590735269621e-06} {'loss': 0.4977, 'grad_norm': 4.5217156410217285, 'learning_rate': 6.930590735269621e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 1.11, 'grad_norm': 2.371661901473999, 'learning_rate': 6.924101448389115e-06}[Rank 3] Trainer log: {'loss': 1.11, 'grad_norm': 2.371661901473999, 'learning_rate': 6.924101448389115e-06} [Rank 1] Trainer log: {'loss': 1.11, 'grad_norm': 2.371661901473999, 'learning_rate': 6.924101448389115e-06} [Rank 2] Trainer log: {'loss': 1.11, 'grad_norm': 2.371661901473999, 'learning_rate': 6.924101448389115e-06} {'loss': 1.11, 'grad_norm': 2.371661901473999, 'learning_rate': 6.924101448389115e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.6573, 'grad_norm': 2.1686744689941406, 'learning_rate': 6.917613591835521e-06}[Rank 2] Trainer log: {'loss': 0.6573, 'grad_norm': 2.1686744689941406, 'learning_rate': 6.917613591835521e-06}[Rank 3] Trainer log: {'loss': 0.6573, 'grad_norm': 2.1686744689941406, 'learning_rate': 6.917613591835521e-06} [Rank 0] Trainer log: {'loss': 0.6573, 'grad_norm': 2.1686744689941406, 'learning_rate': 6.917613591835521e-06} {'loss': 0.6573, 'grad_norm': 2.1686744689941406, 'learning_rate': 6.917613591835521e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.7887, 'grad_norm': 3.710576295852661, 'learning_rate': 6.911127168625775e-06}[Rank 1] Trainer log: {'loss': 0.7887, 'grad_norm': 3.710576295852661, 'learning_rate': 6.911127168625775e-06} [Rank 0] Trainer log: {'loss': 0.7887, 'grad_norm': 3.710576295852661, 'learning_rate': 6.911127168625775e-06}[Rank 3] Trainer log: {'loss': 0.7887, 'grad_norm': 3.710576295852661, 'learning_rate': 6.911127168625775e-06} {'loss': 0.7887, 'grad_norm': 3.710576295852661, 'learning_rate': 6.911127168625775e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 0.8386, 'grad_norm': 8.667436599731445, 'learning_rate': 6.9046421817761265e-06}[Rank 1] Trainer log: {'loss': 0.8386, 'grad_norm': 8.667436599731445, 'learning_rate': 6.9046421817761265e-06}[Rank 2] Trainer log: {'loss': 0.8386, 'grad_norm': 8.667436599731445, 'learning_rate': 6.9046421817761265e-06} [Rank 3] Trainer log: {'loss': 0.8386, 'grad_norm': 8.667436599731445, 'learning_rate': 6.9046421817761265e-06} {'loss': 0.8386, 'grad_norm': 8.667436599731445, 'learning_rate': 6.9046421817761265e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 1.0249, 'grad_norm': 2.6702418327331543, 'learning_rate': 6.898158634302168e-06}[Rank 3] Trainer log: {'loss': 1.0249, 'grad_norm': 2.6702418327331543, 'learning_rate': 6.898158634302168e-06}[Rank 0] Trainer log: {'loss': 1.0249, 'grad_norm': 2.6702418327331543, 'learning_rate': 6.898158634302168e-06} [Rank 1] Trainer log: {'loss': 1.0249, 'grad_norm': 2.6702418327331543, 'learning_rate': 6.898158634302168e-06} {'loss': 1.0249, 'grad_norm': 2.6702418327331543, 'learning_rate': 6.898158634302168e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 1.1181, 'grad_norm': 6.459060192108154, 'learning_rate': 6.891676529218827e-06}[Rank 1] Trainer log: {'loss': 1.1181, 'grad_norm': 6.459060192108154, 'learning_rate': 6.891676529218827e-06}[Rank 0] Trainer log: {'loss': 1.1181, 'grad_norm': 6.459060192108154, 'learning_rate': 6.891676529218827e-06} [Rank 2] Trainer log: {'loss': 1.1181, 'grad_norm': 6.459060192108154, 'learning_rate': 6.891676529218827e-06} {'loss': 1.1181, 'grad_norm': 6.459060192108154, 'learning_rate': 6.891676529218827e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 1.0507, 'grad_norm': 4.5943684577941895, 'learning_rate': 6.885195869540351e-06}[Rank 1] Trainer log: {'loss': 1.0507, 'grad_norm': 4.5943684577941895, 'learning_rate': 6.885195869540351e-06}[Rank 2] Trainer log: {'loss': 1.0507, 'grad_norm': 4.5943684577941895, 'learning_rate': 6.885195869540351e-06} [Rank 0] Trainer log: {'loss': 1.0507, 'grad_norm': 4.5943684577941895, 'learning_rate': 6.885195869540351e-06} {'loss': 1.0507, 'grad_norm': 4.5943684577941895, 'learning_rate': 6.885195869540351e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.9587, 'grad_norm': 15.67031192779541, 'learning_rate': 6.878716658280311e-06}[Rank 1] Trainer log: {'loss': 0.9587, 'grad_norm': 15.67031192779541, 'learning_rate': 6.878716658280311e-06}[Rank 3] Trainer log: {'loss': 0.9587, 'grad_norm': 15.67031192779541, 'learning_rate': 6.878716658280311e-06} [Rank 0] Trainer log: {'loss': 0.9587, 'grad_norm': 15.67031192779541, 'learning_rate': 6.878716658280311e-06} {'loss': 0.9587, 'grad_norm': 15.67031192779541, 'learning_rate': 6.878716658280311e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 0.8279, 'grad_norm': 2.952714204788208, 'learning_rate': 6.872238898451623e-06}[Rank 3] Trainer log: {'loss': 0.8279, 'grad_norm': 2.952714204788208, 'learning_rate': 6.872238898451623e-06} [Rank 2] Trainer log: {'loss': 0.8279, 'grad_norm': 2.952714204788208, 'learning_rate': 6.872238898451623e-06}[Rank 1] Trainer log: {'loss': 0.8279, 'grad_norm': 2.952714204788208, 'learning_rate': 6.872238898451623e-06} {'loss': 0.8279, 'grad_norm': 2.952714204788208, 'learning_rate': 6.872238898451623e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.8711, 'grad_norm': 9.070573806762695, 'learning_rate': 6.865762593066514e-06}[Rank 0] Trainer log: {'loss': 0.8711, 'grad_norm': 9.070573806762695, 'learning_rate': 6.865762593066514e-06}[Rank 3] Trainer log: {'loss': 0.8711, 'grad_norm': 9.070573806762695, 'learning_rate': 6.865762593066514e-06} [Rank 1] Trainer log: {'loss': 0.8711, 'grad_norm': 9.070573806762695, 'learning_rate': 6.865762593066514e-06} {'loss': 0.8711, 'grad_norm': 9.070573806762695, 'learning_rate': 6.865762593066514e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.943, 'grad_norm': 3.964731216430664, 'learning_rate': 6.859287745136531e-06}[Rank 0] Trainer log: {'loss': 0.943, 'grad_norm': 3.964731216430664, 'learning_rate': 6.859287745136531e-06} [Rank 3] Trainer log: {'loss': 0.943, 'grad_norm': 3.964731216430664, 'learning_rate': 6.859287745136531e-06} [Rank 2] Trainer log: {'loss': 0.943, 'grad_norm': 3.964731216430664, 'learning_rate': 6.859287745136531e-06} {'loss': 0.943, 'grad_norm': 3.964731216430664, 'learning_rate': 6.859287745136531e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 1.0287, 'grad_norm': 2.4119625091552734, 'learning_rate': 6.852814357672561e-06}[Rank 3] Trainer log: {'loss': 1.0287, 'grad_norm': 2.4119625091552734, 'learning_rate': 6.852814357672561e-06} [Rank 0] Trainer log: {'loss': 1.0287, 'grad_norm': 2.4119625091552734, 'learning_rate': 6.852814357672561e-06} [Rank 2] Trainer log: {'loss': 1.0287, 'grad_norm': 2.4119625091552734, 'learning_rate': 6.852814357672561e-06} {'loss': 1.0287, 'grad_norm': 2.4119625091552734, 'learning_rate': 6.852814357672561e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 0.9309, 'grad_norm': 4.7669358253479, 'learning_rate': 6.846342433684794e-06}[Rank 2] Trainer log: {'loss': 0.9309, 'grad_norm': 4.7669358253479, 'learning_rate': 6.846342433684794e-06}[Rank 1] Trainer log: {'loss': 0.9309, 'grad_norm': 4.7669358253479, 'learning_rate': 6.846342433684794e-06} [Rank 3] Trainer log: {'loss': 0.9309, 'grad_norm': 4.7669358253479, 'learning_rate': 6.846342433684794e-06} {'loss': 0.9309, 'grad_norm': 4.7669358253479, 'learning_rate': 6.846342433684794e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8712, 'grad_norm': 8.53010082244873, 'learning_rate': 6.8398719761827435e-06}[Rank 1] Trainer log: {'loss': 0.8712, 'grad_norm': 8.53010082244873, 'learning_rate': 6.8398719761827435e-06}[Rank 0] Trainer log: {'loss': 0.8712, 'grad_norm': 8.53010082244873, 'learning_rate': 6.8398719761827435e-06} [Rank 2] Trainer log: {'loss': 0.8712, 'grad_norm': 8.53010082244873, 'learning_rate': 6.8398719761827435e-06} {'loss': 0.8712, 'grad_norm': 8.53010082244873, 'learning_rate': 6.8398719761827435e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.8007, 'grad_norm': 4.453543186187744, 'learning_rate': 6.8334029881752536e-06}[Rank 2] Trainer log: {'loss': 0.8007, 'grad_norm': 4.453543186187744, 'learning_rate': 6.8334029881752536e-06}[Rank 3] Trainer log: {'loss': 0.8007, 'grad_norm': 4.453543186187744, 'learning_rate': 6.8334029881752536e-06} [Rank 0] Trainer log: {'loss': 0.8007, 'grad_norm': 4.453543186187744, 'learning_rate': 6.8334029881752536e-06} {'loss': 0.8007, 'grad_norm': 4.453543186187744, 'learning_rate': 6.8334029881752536e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 0.7953, 'grad_norm': 4.845802307128906, 'learning_rate': 6.82693547267047e-06}[Rank 1] Trainer log: {'loss': 0.7953, 'grad_norm': 4.845802307128906, 'learning_rate': 6.82693547267047e-06}[Rank 3] Trainer log: {'loss': 0.7953, 'grad_norm': 4.845802307128906, 'learning_rate': 6.82693547267047e-06} [Rank 2] Trainer log: {'loss': 0.7953, 'grad_norm': 4.845802307128906, 'learning_rate': 6.82693547267047e-06} {'loss': 0.7953, 'grad_norm': 4.845802307128906, 'learning_rate': 6.82693547267047e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.8303, 'grad_norm': 12.230664253234863, 'learning_rate': 6.8204694326758555e-06} [Rank 2] Trainer log: {'loss': 0.8303, 'grad_norm': 12.230664253234863, 'learning_rate': 6.8204694326758555e-06}[Rank 3] Trainer log: {'loss': 0.8303, 'grad_norm': 12.230664253234863, 'learning_rate': 6.8204694326758555e-06} [Rank 0] Trainer log: {'loss': 0.8303, 'grad_norm': 12.230664253234863, 'learning_rate': 6.8204694326758555e-06} {'loss': 0.8303, 'grad_norm': 12.230664253234863, 'learning_rate': 6.8204694326758555e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8576, 'grad_norm': 4.349980354309082, 'learning_rate': 6.8140048711981954e-06}[Rank 1] Trainer log: {'loss': 0.8576, 'grad_norm': 4.349980354309082, 'learning_rate': 6.8140048711981954e-06} [Rank 2] Trainer log: {'loss': 0.8576, 'grad_norm': 4.349980354309082, 'learning_rate': 6.8140048711981954e-06} [Rank 0] Trainer log: {'loss': 0.8576, 'grad_norm': 4.349980354309082, 'learning_rate': 6.8140048711981954e-06} {'loss': 0.8576, 'grad_norm': 4.349980354309082, 'learning_rate': 6.8140048711981954e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 1.018, 'grad_norm': 6.748476982116699, 'learning_rate': 6.807541791243584e-06}[Rank 3] Trainer log: {'loss': 1.018, 'grad_norm': 6.748476982116699, 'learning_rate': 6.807541791243584e-06}[Rank 0] Trainer log: {'loss': 1.018, 'grad_norm': 6.748476982116699, 'learning_rate': 6.807541791243584e-06} [Rank 1] Trainer log: {'loss': 1.018, 'grad_norm': 6.748476982116699, 'learning_rate': 6.807541791243584e-06} {'loss': 1.018, 'grad_norm': 6.748476982116699, 'learning_rate': 6.807541791243584e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 0.8015, 'grad_norm': 5.6404500007629395, 'learning_rate': 6.801080195817417e-06}[Rank 2] Trainer log: {'loss': 0.8015, 'grad_norm': 5.6404500007629395, 'learning_rate': 6.801080195817417e-06} [Rank 3] Trainer log: {'loss': 0.8015, 'grad_norm': 5.6404500007629395, 'learning_rate': 6.801080195817417e-06} [Rank 0] Trainer log: {'loss': 0.8015, 'grad_norm': 5.6404500007629395, 'learning_rate': 6.801080195817417e-06} {'loss': 0.8015, 'grad_norm': 5.6404500007629395, 'learning_rate': 6.801080195817417e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.6475, 'grad_norm': 8.688678741455078, 'learning_rate': 6.794620087924418e-06}[Rank 0] Trainer log: {'loss': 0.6475, 'grad_norm': 8.688678741455078, 'learning_rate': 6.794620087924418e-06}[Rank 3] Trainer log: {'loss': 0.6475, 'grad_norm': 8.688678741455078, 'learning_rate': 6.794620087924418e-06} [Rank 1] Trainer log: {'loss': 0.6475, 'grad_norm': 8.688678741455078, 'learning_rate': 6.794620087924418e-06} {'loss': 0.6475, 'grad_norm': 8.688678741455078, 'learning_rate': 6.794620087924418e-06, 'epoch': 0.62} [Rank 1] Trainer log: {'loss': 1.0314, 'grad_norm': 4.58804988861084, 'learning_rate': 6.7881614705685995e-06}[Rank 2] Trainer log: {'loss': 1.0314, 'grad_norm': 4.58804988861084, 'learning_rate': 6.7881614705685995e-06} [Rank 0] Trainer log: {'loss': 1.0314, 'grad_norm': 4.58804988861084, 'learning_rate': 6.7881614705685995e-06} [Rank 3] Trainer log: {'loss': 1.0314, 'grad_norm': 4.58804988861084, 'learning_rate': 6.7881614705685995e-06} {'loss': 1.0314, 'grad_norm': 4.58804988861084, 'learning_rate': 6.7881614705685995e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.7484, 'grad_norm': 5.965458869934082, 'learning_rate': 6.781704346753295e-06}[Rank 1] Trainer log: {'loss': 0.7484, 'grad_norm': 5.965458869934082, 'learning_rate': 6.781704346753295e-06}[Rank 2] Trainer log: {'loss': 0.7484, 'grad_norm': 5.965458869934082, 'learning_rate': 6.781704346753295e-06} [Rank 0] Trainer log: {'loss': 0.7484, 'grad_norm': 5.965458869934082, 'learning_rate': 6.781704346753295e-06} {'loss': 0.7484, 'grad_norm': 5.965458869934082, 'learning_rate': 6.781704346753295e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 0.895, 'grad_norm': 6.741551399230957, 'learning_rate': 6.77524871948114e-06}[Rank 1] Trainer log: {'loss': 0.895, 'grad_norm': 6.741551399230957, 'learning_rate': 6.77524871948114e-06}[Rank 3] Trainer log: {'loss': 0.895, 'grad_norm': 6.741551399230957, 'learning_rate': 6.77524871948114e-06} [Rank 0] Trainer log: {'loss': 0.895, 'grad_norm': 6.741551399230957, 'learning_rate': 6.77524871948114e-06} {'loss': 0.895, 'grad_norm': 6.741551399230957, 'learning_rate': 6.77524871948114e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.8991, 'grad_norm': 2.63576340675354, 'learning_rate': 6.768794591754071e-06}[Rank 0] Trainer log: {'loss': 0.8991, 'grad_norm': 2.63576340675354, 'learning_rate': 6.768794591754071e-06} [Rank 1] Trainer log: {'loss': 0.8991, 'grad_norm': 2.63576340675354, 'learning_rate': 6.768794591754071e-06}[Rank 2] Trainer log: {'loss': 0.8991, 'grad_norm': 2.63576340675354, 'learning_rate': 6.768794591754071e-06} {'loss': 0.8991, 'grad_norm': 2.63576340675354, 'learning_rate': 6.768794591754071e-06, 'epoch': 0.62} [Rank 3] Trainer log: {'loss': 0.7032, 'grad_norm': 9.233600616455078, 'learning_rate': 6.762341966573327e-06}[Rank 2] Trainer log: {'loss': 0.7032, 'grad_norm': 9.233600616455078, 'learning_rate': 6.762341966573327e-06} [Rank 0] Trainer log: {'loss': 0.7032, 'grad_norm': 9.233600616455078, 'learning_rate': 6.762341966573327e-06} [Rank 1] Trainer log: {'loss': 0.7032, 'grad_norm': 9.233600616455078, 'learning_rate': 6.762341966573327e-06} {'loss': 0.7032, 'grad_norm': 9.233600616455078, 'learning_rate': 6.762341966573327e-06, 'epoch': 0.62} [Rank 2] Trainer log: {'loss': 1.0683, 'grad_norm': 2.6078670024871826, 'learning_rate': 6.755890846939454e-06}[Rank 1] Trainer log: {'loss': 1.0683, 'grad_norm': 2.6078670024871826, 'learning_rate': 6.755890846939454e-06}[Rank 3] Trainer log: {'loss': 1.0683, 'grad_norm': 2.6078670024871826, 'learning_rate': 6.755890846939454e-06} [Rank 0] Trainer log: {'loss': 1.0683, 'grad_norm': 2.6078670024871826, 'learning_rate': 6.755890846939454e-06} {'loss': 1.0683, 'grad_norm': 2.6078670024871826, 'learning_rate': 6.755890846939454e-06, 'epoch': 0.62} [Rank 0] Trainer log: {'loss': 1.0784, 'grad_norm': 3.4977593421936035, 'learning_rate': 6.749441235852292e-06}[Rank 2] Trainer log: {'loss': 1.0784, 'grad_norm': 3.4977593421936035, 'learning_rate': 6.749441235852292e-06}[Rank 1] Trainer log: {'loss': 1.0784, 'grad_norm': 3.4977593421936035, 'learning_rate': 6.749441235852292e-06} [Rank 3] Trainer log: {'loss': 1.0784, 'grad_norm': 3.4977593421936035, 'learning_rate': 6.749441235852292e-06} {'loss': 1.0784, 'grad_norm': 3.4977593421936035, 'learning_rate': 6.749441235852292e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.6998, 'grad_norm': 3.194459915161133, 'learning_rate': 6.7429931363109815e-06}[Rank 2] Trainer log: {'loss': 0.6998, 'grad_norm': 3.194459915161133, 'learning_rate': 6.7429931363109815e-06}[Rank 1] Trainer log: {'loss': 0.6998, 'grad_norm': 3.194459915161133, 'learning_rate': 6.7429931363109815e-06} [Rank 0] Trainer log: {'loss': 0.6998, 'grad_norm': 3.194459915161133, 'learning_rate': 6.7429931363109815e-06} {'loss': 0.6998, 'grad_norm': 3.194459915161133, 'learning_rate': 6.7429931363109815e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.777, 'grad_norm': 4.167316913604736, 'learning_rate': 6.736546551313957e-06}[Rank 1] Trainer log: {'loss': 0.777, 'grad_norm': 4.167316913604736, 'learning_rate': 6.736546551313957e-06}[Rank 3] Trainer log: {'loss': 0.777, 'grad_norm': 4.167316913604736, 'learning_rate': 6.736546551313957e-06} [Rank 0] Trainer log: {'loss': 0.777, 'grad_norm': 4.167316913604736, 'learning_rate': 6.736546551313957e-06} {'loss': 0.777, 'grad_norm': 4.167316913604736, 'learning_rate': 6.736546551313957e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.6606, 'grad_norm': 11.742955207824707, 'learning_rate': 6.73010148385896e-06}[Rank 2] Trainer log: {'loss': 0.6606, 'grad_norm': 11.742955207824707, 'learning_rate': 6.73010148385896e-06}[Rank 0] Trainer log: {'loss': 0.6606, 'grad_norm': 11.742955207824707, 'learning_rate': 6.73010148385896e-06} [Rank 3] Trainer log: {'loss': 0.6606, 'grad_norm': 11.742955207824707, 'learning_rate': 6.73010148385896e-06} {'loss': 0.6606, 'grad_norm': 11.742955207824707, 'learning_rate': 6.73010148385896e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.9697, 'grad_norm': 4.024625301361084, 'learning_rate': 6.723657936943014e-06}[Rank 3] Trainer log: {'loss': 0.9697, 'grad_norm': 4.024625301361084, 'learning_rate': 6.723657936943014e-06} [Rank 1] Trainer log: {'loss': 0.9697, 'grad_norm': 4.024625301361084, 'learning_rate': 6.723657936943014e-06} [Rank 0] Trainer log: {'loss': 0.9697, 'grad_norm': 4.024625301361084, 'learning_rate': 6.723657936943014e-06} {'loss': 0.9697, 'grad_norm': 4.024625301361084, 'learning_rate': 6.723657936943014e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7394, 'grad_norm': 2.4450864791870117, 'learning_rate': 6.717215913562436e-06}[Rank 1] Trainer log: {'loss': 0.7394, 'grad_norm': 2.4450864791870117, 'learning_rate': 6.717215913562436e-06} [Rank 3] Trainer log: {'loss': 0.7394, 'grad_norm': 2.4450864791870117, 'learning_rate': 6.717215913562436e-06} [Rank 0] Trainer log: {'loss': 0.7394, 'grad_norm': 2.4450864791870117, 'learning_rate': 6.717215913562436e-06} {'loss': 0.7394, 'grad_norm': 2.4450864791870117, 'learning_rate': 6.717215913562436e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.6538, 'grad_norm': 2.650116443634033, 'learning_rate': 6.7107754167128426e-06}[Rank 2] Trainer log: {'loss': 0.6538, 'grad_norm': 2.650116443634033, 'learning_rate': 6.7107754167128426e-06}[Rank 3] Trainer log: {'loss': 0.6538, 'grad_norm': 2.650116443634033, 'learning_rate': 6.7107754167128426e-06} [Rank 0] Trainer log: {'loss': 0.6538, 'grad_norm': 2.650116443634033, 'learning_rate': 6.7107754167128426e-06} {'loss': 0.6538, 'grad_norm': 2.650116443634033, 'learning_rate': 6.7107754167128426e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.9192, 'grad_norm': 10.823471069335938, 'learning_rate': 6.704336449389137e-06}[Rank 1] Trainer log: {'loss': 0.9192, 'grad_norm': 10.823471069335938, 'learning_rate': 6.704336449389137e-06}[Rank 3] Trainer log: {'loss': 0.9192, 'grad_norm': 10.823471069335938, 'learning_rate': 6.704336449389137e-06} [Rank 0] Trainer log: {'loss': 0.9192, 'grad_norm': 10.823471069335938, 'learning_rate': 6.704336449389137e-06} {'loss': 0.9192, 'grad_norm': 10.823471069335938, 'learning_rate': 6.704336449389137e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7286, 'grad_norm': 4.4822678565979, 'learning_rate': 6.697899014585507e-06}[Rank 0] Trainer log: {'loss': 0.7286, 'grad_norm': 4.4822678565979, 'learning_rate': 6.697899014585507e-06}[Rank 3] Trainer log: {'loss': 0.7286, 'grad_norm': 4.4822678565979, 'learning_rate': 6.697899014585507e-06} [Rank 1] Trainer log: {'loss': 0.7286, 'grad_norm': 4.4822678565979, 'learning_rate': 6.697899014585507e-06} {'loss': 0.7286, 'grad_norm': 4.4822678565979, 'learning_rate': 6.697899014585507e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.9077, 'grad_norm': 4.743864059448242, 'learning_rate': 6.691463115295435e-06} [Rank 0] Trainer log: {'loss': 0.9077, 'grad_norm': 4.743864059448242, 'learning_rate': 6.691463115295435e-06}[Rank 2] Trainer log: {'loss': 0.9077, 'grad_norm': 4.743864059448242, 'learning_rate': 6.691463115295435e-06} [Rank 1] Trainer log: {'loss': 0.9077, 'grad_norm': 4.743864059448242, 'learning_rate': 6.691463115295435e-06} {'loss': 0.9077, 'grad_norm': 4.743864059448242, 'learning_rate': 6.691463115295435e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.6992, 'grad_norm': 6.27946662902832, 'learning_rate': 6.685028754511682e-06}[Rank 3] Trainer log: {'loss': 0.6992, 'grad_norm': 6.27946662902832, 'learning_rate': 6.685028754511682e-06}[Rank 0] Trainer log: {'loss': 0.6992, 'grad_norm': 6.27946662902832, 'learning_rate': 6.685028754511682e-06} [Rank 1] Trainer log: {'loss': 0.6992, 'grad_norm': 6.27946662902832, 'learning_rate': 6.685028754511682e-06} {'loss': 0.6992, 'grad_norm': 6.27946662902832, 'learning_rate': 6.685028754511682e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 1.0256, 'grad_norm': 6.665810585021973, 'learning_rate': 6.678595935226296e-06}[Rank 2] Trainer log: {'loss': 1.0256, 'grad_norm': 6.665810585021973, 'learning_rate': 6.678595935226296e-06} [Rank 3] Trainer log: {'loss': 1.0256, 'grad_norm': 6.665810585021973, 'learning_rate': 6.678595935226296e-06} [Rank 0] Trainer log: {'loss': 1.0256, 'grad_norm': 6.665810585021973, 'learning_rate': 6.678595935226296e-06} {'loss': 1.0256, 'grad_norm': 6.665810585021973, 'learning_rate': 6.678595935226296e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7546, 'grad_norm': 3.8816606998443604, 'learning_rate': 6.672164660430616e-06}[Rank 3] Trainer log: {'loss': 0.7546, 'grad_norm': 3.8816606998443604, 'learning_rate': 6.672164660430616e-06}[Rank 1] Trainer log: {'loss': 0.7546, 'grad_norm': 3.8816606998443604, 'learning_rate': 6.672164660430616e-06} [Rank 0] Trainer log: {'loss': 0.7546, 'grad_norm': 3.8816606998443604, 'learning_rate': 6.672164660430616e-06} {'loss': 0.7546, 'grad_norm': 3.8816606998443604, 'learning_rate': 6.672164660430616e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.6531, 'grad_norm': 10.988195419311523, 'learning_rate': 6.665734933115252e-06}[Rank 2] Trainer log: {'loss': 0.6531, 'grad_norm': 10.988195419311523, 'learning_rate': 6.665734933115252e-06}[Rank 0] Trainer log: {'loss': 0.6531, 'grad_norm': 10.988195419311523, 'learning_rate': 6.665734933115252e-06} [Rank 3] Trainer log: {'loss': 0.6531, 'grad_norm': 10.988195419311523, 'learning_rate': 6.665734933115252e-06} {'loss': 0.6531, 'grad_norm': 10.988195419311523, 'learning_rate': 6.665734933115252e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.7799, 'grad_norm': 14.149457931518555, 'learning_rate': 6.6593067562700945e-06}[Rank 1] Trainer log: {'loss': 0.7799, 'grad_norm': 14.149457931518555, 'learning_rate': 6.6593067562700945e-06}[Rank 2] Trainer log: {'loss': 0.7799, 'grad_norm': 14.149457931518555, 'learning_rate': 6.6593067562700945e-06} [Rank 0] Trainer log: {'loss': 0.7799, 'grad_norm': 14.149457931518555, 'learning_rate': 6.6593067562700945e-06} {'loss': 0.7799, 'grad_norm': 14.149457931518555, 'learning_rate': 6.6593067562700945e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8064, 'grad_norm': 10.792061805725098, 'learning_rate': 6.652880132884322e-06}[Rank 1] Trainer log: {'loss': 0.8064, 'grad_norm': 10.792061805725098, 'learning_rate': 6.652880132884322e-06} [Rank 3] Trainer log: {'loss': 0.8064, 'grad_norm': 10.792061805725098, 'learning_rate': 6.652880132884322e-06} [Rank 0] Trainer log: {'loss': 0.8064, 'grad_norm': 10.792061805725098, 'learning_rate': 6.652880132884322e-06} {'loss': 0.8064, 'grad_norm': 10.792061805725098, 'learning_rate': 6.652880132884322e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7955, 'grad_norm': 4.819939136505127, 'learning_rate': 6.646455065946386e-06}[Rank 3] Trainer log: {'loss': 0.7955, 'grad_norm': 4.819939136505127, 'learning_rate': 6.646455065946386e-06}[Rank 1] Trainer log: {'loss': 0.7955, 'grad_norm': 4.819939136505127, 'learning_rate': 6.646455065946386e-06} [Rank 0] Trainer log: {'loss': 0.7955, 'grad_norm': 4.819939136505127, 'learning_rate': 6.646455065946386e-06} {'loss': 0.7955, 'grad_norm': 4.819939136505127, 'learning_rate': 6.646455065946386e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8018, 'grad_norm': 6.197938442230225, 'learning_rate': 6.640031558444009e-06}[Rank 3] Trainer log: {'loss': 0.8018, 'grad_norm': 6.197938442230225, 'learning_rate': 6.640031558444009e-06}[Rank 1] Trainer log: {'loss': 0.8018, 'grad_norm': 6.197938442230225, 'learning_rate': 6.640031558444009e-06} [Rank 0] Trainer log: {'loss': 0.8018, 'grad_norm': 6.197938442230225, 'learning_rate': 6.640031558444009e-06} {'loss': 0.8018, 'grad_norm': 6.197938442230225, 'learning_rate': 6.640031558444009e-06, 'epoch': 0.63} [Rank 0] Trainer log: {'loss': 0.9133, 'grad_norm': 4.241856575012207, 'learning_rate': 6.6336096133642e-06}[Rank 1] Trainer log: {'loss': 0.9133, 'grad_norm': 4.241856575012207, 'learning_rate': 6.6336096133642e-06}[Rank 3] Trainer log: {'loss': 0.9133, 'grad_norm': 4.241856575012207, 'learning_rate': 6.6336096133642e-06} [Rank 2] Trainer log: {'loss': 0.9133, 'grad_norm': 4.241856575012207, 'learning_rate': 6.6336096133642e-06} {'loss': 0.9133, 'grad_norm': 4.241856575012207, 'learning_rate': 6.6336096133642e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7931, 'grad_norm': 5.482886791229248, 'learning_rate': 6.62718923369323e-06}[Rank 3] Trainer log: {'loss': 0.7931, 'grad_norm': 5.482886791229248, 'learning_rate': 6.62718923369323e-06} [Rank 1] Trainer log: {'loss': 0.7931, 'grad_norm': 5.482886791229248, 'learning_rate': 6.62718923369323e-06} [Rank 0] Trainer log: {'loss': 0.7931, 'grad_norm': 5.482886791229248, 'learning_rate': 6.62718923369323e-06} {'loss': 0.7931, 'grad_norm': 5.482886791229248, 'learning_rate': 6.62718923369323e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8164, 'grad_norm': 3.176365613937378, 'learning_rate': 6.620770422416644e-06}[Rank 3] Trainer log: {'loss': 0.8164, 'grad_norm': 3.176365613937378, 'learning_rate': 6.620770422416644e-06}[Rank 1] Trainer log: {'loss': 0.8164, 'grad_norm': 3.176365613937378, 'learning_rate': 6.620770422416644e-06} [Rank 0] Trainer log: {'loss': 0.8164, 'grad_norm': 3.176365613937378, 'learning_rate': 6.620770422416644e-06} {'loss': 0.8164, 'grad_norm': 3.176365613937378, 'learning_rate': 6.620770422416644e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.9742, 'grad_norm': 5.690133094787598, 'learning_rate': 6.614353182519269e-06} [Rank 2] Trainer log: {'loss': 0.9742, 'grad_norm': 5.690133094787598, 'learning_rate': 6.614353182519269e-06} [Rank 1] Trainer log: {'loss': 0.9742, 'grad_norm': 5.690133094787598, 'learning_rate': 6.614353182519269e-06} [Rank 0] Trainer log: {'loss': 0.9742, 'grad_norm': 5.690133094787598, 'learning_rate': 6.614353182519269e-06} {'loss': 0.9742, 'grad_norm': 5.690133094787598, 'learning_rate': 6.614353182519269e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 1.0078, 'grad_norm': 6.193117141723633, 'learning_rate': 6.607937516985187e-06}[Rank 2] Trainer log: {'loss': 1.0078, 'grad_norm': 6.193117141723633, 'learning_rate': 6.607937516985187e-06} [Rank 1] Trainer log: {'loss': 1.0078, 'grad_norm': 6.193117141723633, 'learning_rate': 6.607937516985187e-06} [Rank 0] Trainer log: {'loss': 1.0078, 'grad_norm': 6.193117141723633, 'learning_rate': 6.607937516985187e-06} {'loss': 1.0078, 'grad_norm': 6.193117141723633, 'learning_rate': 6.607937516985187e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7092, 'grad_norm': 5.6458635330200195, 'learning_rate': 6.601523428797752e-06}[Rank 1] Trainer log: {'loss': 0.7092, 'grad_norm': 5.6458635330200195, 'learning_rate': 6.601523428797752e-06} [Rank 3] Trainer log: {'loss': 0.7092, 'grad_norm': 5.6458635330200195, 'learning_rate': 6.601523428797752e-06} [Rank 0] Trainer log: {'loss': 0.7092, 'grad_norm': 5.6458635330200195, 'learning_rate': 6.601523428797752e-06} {'loss': 0.7092, 'grad_norm': 5.6458635330200195, 'learning_rate': 6.601523428797752e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.9594, 'grad_norm': 4.853696823120117, 'learning_rate': 6.5951109209395895e-06}[Rank 3] Trainer log: {'loss': 0.9594, 'grad_norm': 4.853696823120117, 'learning_rate': 6.5951109209395895e-06} [Rank 1] Trainer log: {'loss': 0.9594, 'grad_norm': 4.853696823120117, 'learning_rate': 6.5951109209395895e-06} [Rank 0] Trainer log: {'loss': 0.9594, 'grad_norm': 4.853696823120117, 'learning_rate': 6.5951109209395895e-06} {'loss': 0.9594, 'grad_norm': 4.853696823120117, 'learning_rate': 6.5951109209395895e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8871, 'grad_norm': 9.1314115524292, 'learning_rate': 6.588699996392587e-06}[Rank 3] Trainer log: {'loss': 0.8871, 'grad_norm': 9.1314115524292, 'learning_rate': 6.588699996392587e-06} [Rank 1] Trainer log: {'loss': 0.8871, 'grad_norm': 9.1314115524292, 'learning_rate': 6.588699996392587e-06} [Rank 0] Trainer log: {'loss': 0.8871, 'grad_norm': 9.1314115524292, 'learning_rate': 6.588699996392587e-06} {'loss': 0.8871, 'grad_norm': 9.1314115524292, 'learning_rate': 6.588699996392587e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.9664, 'grad_norm': 8.45080280303955, 'learning_rate': 6.5822906581378955e-06}[Rank 1] Trainer log: {'loss': 0.9664, 'grad_norm': 8.45080280303955, 'learning_rate': 6.5822906581378955e-06} [Rank 2] Trainer log: {'loss': 0.9664, 'grad_norm': 8.45080280303955, 'learning_rate': 6.5822906581378955e-06} [Rank 0] Trainer log: {'loss': 0.9664, 'grad_norm': 8.45080280303955, 'learning_rate': 6.5822906581378955e-06} {'loss': 0.9664, 'grad_norm': 8.45080280303955, 'learning_rate': 6.5822906581378955e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.6539, 'grad_norm': 7.58603048324585, 'learning_rate': 6.5758829091559216e-06}[Rank 0] Trainer log: {'loss': 0.6539, 'grad_norm': 7.58603048324585, 'learning_rate': 6.5758829091559216e-06}[Rank 2] Trainer log: {'loss': 0.6539, 'grad_norm': 7.58603048324585, 'learning_rate': 6.5758829091559216e-06} [Rank 3] Trainer log: {'loss': 0.6539, 'grad_norm': 7.58603048324585, 'learning_rate': 6.5758829091559216e-06} {'loss': 0.6539, 'grad_norm': 7.58603048324585, 'learning_rate': 6.5758829091559216e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.9381, 'grad_norm': 2.2179763317108154, 'learning_rate': 6.569476752426347e-06}[Rank 1] Trainer log: {'loss': 0.9381, 'grad_norm': 2.2179763317108154, 'learning_rate': 6.569476752426347e-06} [Rank 2] Trainer log: {'loss': 0.9381, 'grad_norm': 2.2179763317108154, 'learning_rate': 6.569476752426347e-06} [Rank 0] Trainer log: {'loss': 0.9381, 'grad_norm': 2.2179763317108154, 'learning_rate': 6.569476752426347e-06} {'loss': 0.9381, 'grad_norm': 2.2179763317108154, 'learning_rate': 6.569476752426347e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.7266, 'grad_norm': 2.883723497390747, 'learning_rate': 6.563072190928106e-06}[Rank 2] Trainer log: {'loss': 0.7266, 'grad_norm': 2.883723497390747, 'learning_rate': 6.563072190928106e-06} [Rank 0] Trainer log: {'loss': 0.7266, 'grad_norm': 2.883723497390747, 'learning_rate': 6.563072190928106e-06}[Rank 1] Trainer log: {'loss': 0.7266, 'grad_norm': 2.883723497390747, 'learning_rate': 6.563072190928106e-06} {'loss': 0.7266, 'grad_norm': 2.883723497390747, 'learning_rate': 6.563072190928106e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7248, 'grad_norm': 3.7733912467956543, 'learning_rate': 6.556669227639384e-06}[Rank 3] Trainer log: {'loss': 0.7248, 'grad_norm': 3.7733912467956543, 'learning_rate': 6.556669227639384e-06} [Rank 1] Trainer log: {'loss': 0.7248, 'grad_norm': 3.7733912467956543, 'learning_rate': 6.556669227639384e-06} [Rank 0] Trainer log: {'loss': 0.7248, 'grad_norm': 3.7733912467956543, 'learning_rate': 6.556669227639384e-06} {'loss': 0.7248, 'grad_norm': 3.7733912467956543, 'learning_rate': 6.556669227639384e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7599, 'grad_norm': 2.6240172386169434, 'learning_rate': 6.5502678655376385e-06} [Rank 1] Trainer log: {'loss': 0.7599, 'grad_norm': 2.6240172386169434, 'learning_rate': 6.5502678655376385e-06} [Rank 0] Trainer log: {'loss': 0.7599, 'grad_norm': 2.6240172386169434, 'learning_rate': 6.5502678655376385e-06} {'loss': 0.7599, 'grad_norm': 2.6240172386169434, 'learning_rate': 6.5502678655376385e-06, 'epoch': 0.63} [Rank 3] Trainer log: {'loss': 0.7599, 'grad_norm': 2.6240172386169434, 'learning_rate': 6.5502678655376385e-06} [Rank 1] Trainer log: {'loss': 0.6987, 'grad_norm': 13.23658275604248, 'learning_rate': 6.543868107599567e-06}[Rank 2] Trainer log: {'loss': 0.6987, 'grad_norm': 13.23658275604248, 'learning_rate': 6.543868107599567e-06}[Rank 0] Trainer log: {'loss': 0.6987, 'grad_norm': 13.23658275604248, 'learning_rate': 6.543868107599567e-06} [Rank 3] Trainer log: {'loss': 0.6987, 'grad_norm': 13.23658275604248, 'learning_rate': 6.543868107599567e-06} {'loss': 0.6987, 'grad_norm': 13.23658275604248, 'learning_rate': 6.543868107599567e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.5154, 'grad_norm': 6.475583076477051, 'learning_rate': 6.537469956801128e-06} [Rank 1] Trainer log: {'loss': 0.5154, 'grad_norm': 6.475583076477051, 'learning_rate': 6.537469956801128e-06}[Rank 0] Trainer log: {'loss': 0.5154, 'grad_norm': 6.475583076477051, 'learning_rate': 6.537469956801128e-06} [Rank 3] Trainer log: {'loss': 0.5154, 'grad_norm': 6.475583076477051, 'learning_rate': 6.537469956801128e-06} {'loss': 0.5154, 'grad_norm': 6.475583076477051, 'learning_rate': 6.537469956801128e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.6528, 'grad_norm': 3.376832962036133, 'learning_rate': 6.5310734161175395e-06} [Rank 2] Trainer log: {'loss': 0.6528, 'grad_norm': 3.376832962036133, 'learning_rate': 6.5310734161175395e-06} [Rank 3] Trainer log: {'loss': 0.6528, 'grad_norm': 3.376832962036133, 'learning_rate': 6.5310734161175395e-06} [Rank 0] Trainer log: {'loss': 0.6528, 'grad_norm': 3.376832962036133, 'learning_rate': 6.5310734161175395e-06} {'loss': 0.6528, 'grad_norm': 3.376832962036133, 'learning_rate': 6.5310734161175395e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.5837, 'grad_norm': 6.090897560119629, 'learning_rate': 6.524678488523258e-06}[Rank 0] Trainer log: {'loss': 0.5837, 'grad_norm': 6.090897560119629, 'learning_rate': 6.524678488523258e-06}[Rank 2] Trainer log: {'loss': 0.5837, 'grad_norm': 6.090897560119629, 'learning_rate': 6.524678488523258e-06} [Rank 3] Trainer log: {'loss': 0.5837, 'grad_norm': 6.090897560119629, 'learning_rate': 6.524678488523258e-06} {'loss': 0.5837, 'grad_norm': 6.090897560119629, 'learning_rate': 6.524678488523258e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.56, 'grad_norm': 11.847796440124512, 'learning_rate': 6.5182851769919945e-06} [Rank 3] Trainer log: {'loss': 0.56, 'grad_norm': 11.847796440124512, 'learning_rate': 6.5182851769919945e-06}[Rank 1] Trainer log: {'loss': 0.56, 'grad_norm': 11.847796440124512, 'learning_rate': 6.5182851769919945e-06} [Rank 0] Trainer log: {'loss': 0.56, 'grad_norm': 11.847796440124512, 'learning_rate': 6.5182851769919945e-06} {'loss': 0.56, 'grad_norm': 11.847796440124512, 'learning_rate': 6.5182851769919945e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.5879, 'grad_norm': 13.593832969665527, 'learning_rate': 6.511893484496712e-06}[Rank 1] Trainer log: {'loss': 0.5879, 'grad_norm': 13.593832969665527, 'learning_rate': 6.511893484496712e-06} [Rank 0] Trainer log: {'loss': 0.5879, 'grad_norm': 13.593832969665527, 'learning_rate': 6.511893484496712e-06}[Rank 3] Trainer log: {'loss': 0.5879, 'grad_norm': 13.593832969665527, 'learning_rate': 6.511893484496712e-06} {'loss': 0.5879, 'grad_norm': 13.593832969665527, 'learning_rate': 6.511893484496712e-06, 'epoch': 0.63} [Rank 0] Trainer log: {'loss': 0.6496, 'grad_norm': 9.633304595947266, 'learning_rate': 6.505503414009622e-06}[Rank 1] Trainer log: {'loss': 0.6496, 'grad_norm': 9.633304595947266, 'learning_rate': 6.505503414009622e-06}[Rank 2] Trainer log: {'loss': 0.6496, 'grad_norm': 9.633304595947266, 'learning_rate': 6.505503414009622e-06} [Rank 3] Trainer log: {'loss': 0.6496, 'grad_norm': 9.633304595947266, 'learning_rate': 6.505503414009622e-06} {'loss': 0.6496, 'grad_norm': 9.633304595947266, 'learning_rate': 6.505503414009622e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.861, 'grad_norm': 4.823824882507324, 'learning_rate': 6.49911496850217e-06} [Rank 0] Trainer log: {'loss': 0.861, 'grad_norm': 4.823824882507324, 'learning_rate': 6.49911496850217e-06} [Rank 2] Trainer log: {'loss': 0.861, 'grad_norm': 4.823824882507324, 'learning_rate': 6.49911496850217e-06} [Rank 3] Trainer log: {'loss': 0.861, 'grad_norm': 4.823824882507324, 'learning_rate': 6.49911496850217e-06} {'loss': 0.861, 'grad_norm': 4.823824882507324, 'learning_rate': 6.49911496850217e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.8373, 'grad_norm': 3.1221868991851807, 'learning_rate': 6.492728150945062e-06}[Rank 3] Trainer log: {'loss': 0.8373, 'grad_norm': 3.1221868991851807, 'learning_rate': 6.492728150945062e-06}[Rank 2] Trainer log: {'loss': 0.8373, 'grad_norm': 3.1221868991851807, 'learning_rate': 6.492728150945062e-06} [Rank 0] Trainer log: {'loss': 0.8373, 'grad_norm': 3.1221868991851807, 'learning_rate': 6.492728150945062e-06} {'loss': 0.8373, 'grad_norm': 3.1221868991851807, 'learning_rate': 6.492728150945062e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8177, 'grad_norm': 13.034401893615723, 'learning_rate': 6.486342964308234e-06} [Rank 1] Trainer log: {'loss': 0.8177, 'grad_norm': 13.034401893615723, 'learning_rate': 6.486342964308234e-06} [Rank 3] Trainer log: {'loss': 0.8177, 'grad_norm': 13.034401893615723, 'learning_rate': 6.486342964308234e-06}[Rank 0] Trainer log: {'loss': 0.8177, 'grad_norm': 13.034401893615723, 'learning_rate': 6.486342964308234e-06} {'loss': 0.8177, 'grad_norm': 13.034401893615723, 'learning_rate': 6.486342964308234e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.6377, 'grad_norm': 15.017735481262207, 'learning_rate': 6.479959411560868e-06}[Rank 1] Trainer log: {'loss': 0.6377, 'grad_norm': 15.017735481262207, 'learning_rate': 6.479959411560868e-06}[Rank 3] Trainer log: {'loss': 0.6377, 'grad_norm': 15.017735481262207, 'learning_rate': 6.479959411560868e-06} [Rank 0] Trainer log: {'loss': 0.6377, 'grad_norm': 15.017735481262207, 'learning_rate': 6.479959411560868e-06} {'loss': 0.6377, 'grad_norm': 15.017735481262207, 'learning_rate': 6.479959411560868e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.7767, 'grad_norm': 2.444547653198242, 'learning_rate': 6.4735774956713926e-06}[Rank 2] Trainer log: {'loss': 0.7767, 'grad_norm': 2.444547653198242, 'learning_rate': 6.4735774956713926e-06}[Rank 3] Trainer log: {'loss': 0.7767, 'grad_norm': 2.444547653198242, 'learning_rate': 6.4735774956713926e-06} [Rank 0] Trainer log: {'loss': 0.7767, 'grad_norm': 2.444547653198242, 'learning_rate': 6.4735774956713926e-06} {'loss': 0.7767, 'grad_norm': 2.444547653198242, 'learning_rate': 6.4735774956713926e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.798, 'grad_norm': 5.945978164672852, 'learning_rate': 6.467197219607465e-06}[Rank 2] Trainer log: {'loss': 0.798, 'grad_norm': 5.945978164672852, 'learning_rate': 6.467197219607465e-06}[Rank 0] Trainer log: {'loss': 0.798, 'grad_norm': 5.945978164672852, 'learning_rate': 6.467197219607465e-06} [Rank 3] Trainer log: {'loss': 0.798, 'grad_norm': 5.945978164672852, 'learning_rate': 6.467197219607465e-06} {'loss': 0.798, 'grad_norm': 5.945978164672852, 'learning_rate': 6.467197219607465e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.693, 'grad_norm': 6.45718240737915, 'learning_rate': 6.460818586335982e-06}[Rank 2] Trainer log: {'loss': 0.693, 'grad_norm': 6.45718240737915, 'learning_rate': 6.460818586335982e-06} [Rank 0] Trainer log: {'loss': 0.693, 'grad_norm': 6.45718240737915, 'learning_rate': 6.460818586335982e-06}[Rank 3] Trainer log: {'loss': 0.693, 'grad_norm': 6.45718240737915, 'learning_rate': 6.460818586335982e-06} {'loss': 0.693, 'grad_norm': 6.45718240737915, 'learning_rate': 6.460818586335982e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8793, 'grad_norm': 6.071304798126221, 'learning_rate': 6.454441598823082e-06} [Rank 1] Trainer log: {'loss': 0.8793, 'grad_norm': 6.071304798126221, 'learning_rate': 6.454441598823082e-06} [Rank 3] Trainer log: {'loss': 0.8793, 'grad_norm': 6.071304798126221, 'learning_rate': 6.454441598823082e-06} [Rank 0] Trainer log: {'loss': 0.8793, 'grad_norm': 6.071304798126221, 'learning_rate': 6.454441598823082e-06} {'loss': 0.8793, 'grad_norm': 6.071304798126221, 'learning_rate': 6.454441598823082e-06, 'epoch': 0.63} [Rank 1] Trainer log: {'loss': 0.8612, 'grad_norm': 4.418327331542969, 'learning_rate': 6.448066260034138e-06}[Rank 2] Trainer log: {'loss': 0.8612, 'grad_norm': 4.418327331542969, 'learning_rate': 6.448066260034138e-06} [Rank 0] Trainer log: {'loss': 0.8612, 'grad_norm': 4.418327331542969, 'learning_rate': 6.448066260034138e-06} [Rank 3] Trainer log: {'loss': 0.8612, 'grad_norm': 4.418327331542969, 'learning_rate': 6.448066260034138e-06} {'loss': 0.8612, 'grad_norm': 4.418327331542969, 'learning_rate': 6.448066260034138e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.7817, 'grad_norm': 5.669523239135742, 'learning_rate': 6.441692572933746e-06} [Rank 1] Trainer log: {'loss': 0.7817, 'grad_norm': 5.669523239135742, 'learning_rate': 6.441692572933746e-06} [Rank 3] Trainer log: {'loss': 0.7817, 'grad_norm': 5.669523239135742, 'learning_rate': 6.441692572933746e-06} [Rank 0] Trainer log: {'loss': 0.7817, 'grad_norm': 5.669523239135742, 'learning_rate': 6.441692572933746e-06} {'loss': 0.7817, 'grad_norm': 5.669523239135742, 'learning_rate': 6.441692572933746e-06, 'epoch': 0.63} [Rank 2] Trainer log: {'loss': 0.8431, 'grad_norm': 4.098546028137207, 'learning_rate': 6.4353205404857476e-06}[Rank 1] Trainer log: {'loss': 0.8431, 'grad_norm': 4.098546028137207, 'learning_rate': 6.4353205404857476e-06} [Rank 0] Trainer log: {'loss': 0.8431, 'grad_norm': 4.098546028137207, 'learning_rate': 6.4353205404857476e-06}[Rank 3] Trainer log: {'loss': 0.8431, 'grad_norm': 4.098546028137207, 'learning_rate': 6.4353205404857476e-06} {'loss': 0.8431, 'grad_norm': 4.098546028137207, 'learning_rate': 6.4353205404857476e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.863, 'grad_norm': 2.7435896396636963, 'learning_rate': 6.428950165653204e-06} [Rank 2] Trainer log: {'loss': 0.863, 'grad_norm': 2.7435896396636963, 'learning_rate': 6.428950165653204e-06}[Rank 0] Trainer log: {'loss': 0.863, 'grad_norm': 2.7435896396636963, 'learning_rate': 6.428950165653204e-06} [Rank 1] Trainer log: {'loss': 0.863, 'grad_norm': 2.7435896396636963, 'learning_rate': 6.428950165653204e-06}{'loss': 0.863, 'grad_norm': 2.7435896396636963, 'learning_rate': 6.428950165653204e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8229, 'grad_norm': 5.795838832855225, 'learning_rate': 6.4225814513984094e-06}[Rank 1] Trainer log: {'loss': 0.8229, 'grad_norm': 5.795838832855225, 'learning_rate': 6.4225814513984094e-06}[Rank 3] Trainer log: {'loss': 0.8229, 'grad_norm': 5.795838832855225, 'learning_rate': 6.4225814513984094e-06} [Rank 0] Trainer log: {'loss': 0.8229, 'grad_norm': 5.795838832855225, 'learning_rate': 6.4225814513984094e-06} {'loss': 0.8229, 'grad_norm': 5.795838832855225, 'learning_rate': 6.4225814513984094e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.6078, 'grad_norm': 3.680074691772461, 'learning_rate': 6.416214400682894e-06}[Rank 3] Trainer log: {'loss': 0.6078, 'grad_norm': 3.680074691772461, 'learning_rate': 6.416214400682894e-06}[Rank 0] Trainer log: {'loss': 0.6078, 'grad_norm': 3.680074691772461, 'learning_rate': 6.416214400682894e-06} [Rank 2] Trainer log: {'loss': 0.6078, 'grad_norm': 3.680074691772461, 'learning_rate': 6.416214400682894e-06} {'loss': 0.6078, 'grad_norm': 3.680074691772461, 'learning_rate': 6.416214400682894e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.798, 'grad_norm': 5.004809856414795, 'learning_rate': 6.409849016467401e-06} [Rank 3] Trainer log: {'loss': 0.798, 'grad_norm': 5.004809856414795, 'learning_rate': 6.409849016467401e-06}[Rank 0] Trainer log: {'loss': 0.798, 'grad_norm': 5.004809856414795, 'learning_rate': 6.409849016467401e-06} [Rank 2] Trainer log: {'loss': 0.798, 'grad_norm': 5.004809856414795, 'learning_rate': 6.409849016467401e-06} {'loss': 0.798, 'grad_norm': 5.004809856414795, 'learning_rate': 6.409849016467401e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8515, 'grad_norm': 6.764213562011719, 'learning_rate': 6.403485301711905e-06}[Rank 3] Trainer log: {'loss': 0.8515, 'grad_norm': 6.764213562011719, 'learning_rate': 6.403485301711905e-06}[Rank 2] Trainer log: {'loss': 0.8515, 'grad_norm': 6.764213562011719, 'learning_rate': 6.403485301711905e-06} [Rank 0] Trainer log: {'loss': 0.8515, 'grad_norm': 6.764213562011719, 'learning_rate': 6.403485301711905e-06} {'loss': 0.8515, 'grad_norm': 6.764213562011719, 'learning_rate': 6.403485301711905e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.8189, 'grad_norm': 2.7530250549316406, 'learning_rate': 6.3971232593756e-06}[Rank 2] Trainer log: {'loss': 0.8189, 'grad_norm': 2.7530250549316406, 'learning_rate': 6.3971232593756e-06}[Rank 1] Trainer log: {'loss': 0.8189, 'grad_norm': 2.7530250549316406, 'learning_rate': 6.3971232593756e-06} [Rank 0] Trainer log: {'loss': 0.8189, 'grad_norm': 2.7530250549316406, 'learning_rate': 6.3971232593756e-06} {'loss': 0.8189, 'grad_norm': 2.7530250549316406, 'learning_rate': 6.3971232593756e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.7827, 'grad_norm': 2.7327821254730225, 'learning_rate': 6.390762892416916e-06}[Rank 2] Trainer log: {'loss': 0.7827, 'grad_norm': 2.7327821254730225, 'learning_rate': 6.390762892416916e-06}[Rank 1] Trainer log: {'loss': 0.7827, 'grad_norm': 2.7327821254730225, 'learning_rate': 6.390762892416916e-06} [Rank 0] Trainer log: {'loss': 0.7827, 'grad_norm': 2.7327821254730225, 'learning_rate': 6.390762892416916e-06} {'loss': 0.7827, 'grad_norm': 2.7327821254730225, 'learning_rate': 6.390762892416916e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.7297, 'grad_norm': 3.4569342136383057, 'learning_rate': 6.384404203793488e-06}[Rank 1] Trainer log: {'loss': 0.7297, 'grad_norm': 3.4569342136383057, 'learning_rate': 6.384404203793488e-06}[Rank 2] Trainer log: {'loss': 0.7297, 'grad_norm': 3.4569342136383057, 'learning_rate': 6.384404203793488e-06} [Rank 0] Trainer log: {'loss': 0.7297, 'grad_norm': 3.4569342136383057, 'learning_rate': 6.384404203793488e-06} {'loss': 0.7297, 'grad_norm': 3.4569342136383057, 'learning_rate': 6.384404203793488e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.72, 'grad_norm': 2.719825267791748, 'learning_rate': 6.3780471964621745e-06}[Rank 3] Trainer log: {'loss': 0.72, 'grad_norm': 2.719825267791748, 'learning_rate': 6.3780471964621745e-06}[Rank 0] Trainer log: {'loss': 0.72, 'grad_norm': 2.719825267791748, 'learning_rate': 6.3780471964621745e-06} [Rank 2] Trainer log: {'loss': 0.72, 'grad_norm': 2.719825267791748, 'learning_rate': 6.3780471964621745e-06} {'loss': 0.72, 'grad_norm': 2.719825267791748, 'learning_rate': 6.3780471964621745e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8302, 'grad_norm': 3.3142402172088623, 'learning_rate': 6.371691873379059e-06}[Rank 3] Trainer log: {'loss': 0.8302, 'grad_norm': 3.3142402172088623, 'learning_rate': 6.371691873379059e-06}[Rank 2] Trainer log: {'loss': 0.8302, 'grad_norm': 3.3142402172088623, 'learning_rate': 6.371691873379059e-06} [Rank 0] Trainer log: {'loss': 0.8302, 'grad_norm': 3.3142402172088623, 'learning_rate': 6.371691873379059e-06} {'loss': 0.8302, 'grad_norm': 3.3142402172088623, 'learning_rate': 6.371691873379059e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.9993, 'grad_norm': 5.476067066192627, 'learning_rate': 6.3653382374994364e-06}[Rank 3] Trainer log: {'loss': 0.9993, 'grad_norm': 5.476067066192627, 'learning_rate': 6.3653382374994364e-06}[Rank 1] Trainer log: {'loss': 0.9993, 'grad_norm': 5.476067066192627, 'learning_rate': 6.3653382374994364e-06} [Rank 0] Trainer log: {'loss': 0.9993, 'grad_norm': 5.476067066192627, 'learning_rate': 6.3653382374994364e-06} {'loss': 0.9993, 'grad_norm': 5.476067066192627, 'learning_rate': 6.3653382374994364e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8337, 'grad_norm': 3.8389108180999756, 'learning_rate': 6.358986291777814e-06}[Rank 0] Trainer log: {'loss': 0.8337, 'grad_norm': 3.8389108180999756, 'learning_rate': 6.358986291777814e-06} [Rank 2] Trainer log: {'loss': 0.8337, 'grad_norm': 3.8389108180999756, 'learning_rate': 6.358986291777814e-06} [Rank 3] Trainer log: {'loss': 0.8337, 'grad_norm': 3.8389108180999756, 'learning_rate': 6.358986291777814e-06} {'loss': 0.8337, 'grad_norm': 3.8389108180999756, 'learning_rate': 6.358986291777814e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8066, 'grad_norm': 2.7967324256896973, 'learning_rate': 6.352636039167923e-06}[Rank 3] Trainer log: {'loss': 0.8066, 'grad_norm': 2.7967324256896973, 'learning_rate': 6.352636039167923e-06} [Rank 0] Trainer log: {'loss': 0.8066, 'grad_norm': 2.7967324256896973, 'learning_rate': 6.352636039167923e-06}[Rank 2] Trainer log: {'loss': 0.8066, 'grad_norm': 2.7967324256896973, 'learning_rate': 6.352636039167923e-06} {'loss': 0.8066, 'grad_norm': 2.7967324256896973, 'learning_rate': 6.352636039167923e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.9892, 'grad_norm': 8.17422866821289, 'learning_rate': 6.346287482622697e-06} [Rank 0] Trainer log: {'loss': 0.9892, 'grad_norm': 8.17422866821289, 'learning_rate': 6.346287482622697e-06}[Rank 1] Trainer log: {'loss': 0.9892, 'grad_norm': 8.17422866821289, 'learning_rate': 6.346287482622697e-06} [Rank 3] Trainer log: {'loss': 0.9892, 'grad_norm': 8.17422866821289, 'learning_rate': 6.346287482622697e-06} {'loss': 0.9892, 'grad_norm': 8.17422866821289, 'learning_rate': 6.346287482622697e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.6869, 'grad_norm': 6.638793468475342, 'learning_rate': 6.3399406250942845e-06}[Rank 1] Trainer log: {'loss': 0.6869, 'grad_norm': 6.638793468475342, 'learning_rate': 6.3399406250942845e-06}[Rank 0] Trainer log: {'loss': 0.6869, 'grad_norm': 6.638793468475342, 'learning_rate': 6.3399406250942845e-06} [Rank 3] Trainer log: {'loss': 0.6869, 'grad_norm': 6.638793468475342, 'learning_rate': 6.3399406250942845e-06} {'loss': 0.6869, 'grad_norm': 6.638793468475342, 'learning_rate': 6.3399406250942845e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.9227, 'grad_norm': 8.64431095123291, 'learning_rate': 6.3335954695340485e-06}[Rank 1] Trainer log: {'loss': 0.9227, 'grad_norm': 8.64431095123291, 'learning_rate': 6.3335954695340485e-06} [Rank 3] Trainer log: {'loss': 0.9227, 'grad_norm': 8.64431095123291, 'learning_rate': 6.3335954695340485e-06} [Rank 0] Trainer log: {'loss': 0.9227, 'grad_norm': 8.64431095123291, 'learning_rate': 6.3335954695340485e-06} {'loss': 0.9227, 'grad_norm': 8.64431095123291, 'learning_rate': 6.3335954695340485e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8289, 'grad_norm': 2.515284538269043, 'learning_rate': 6.327252018892553e-06}[Rank 2] Trainer log: {'loss': 0.8289, 'grad_norm': 2.515284538269043, 'learning_rate': 6.327252018892553e-06} [Rank 0] Trainer log: {'loss': 0.8289, 'grad_norm': 2.515284538269043, 'learning_rate': 6.327252018892553e-06} [Rank 3] Trainer log: {'loss': 0.8289, 'grad_norm': 2.515284538269043, 'learning_rate': 6.327252018892553e-06} {'loss': 0.8289, 'grad_norm': 2.515284538269043, 'learning_rate': 6.327252018892553e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8396, 'grad_norm': 7.63027811050415, 'learning_rate': 6.320910276119576e-06}[Rank 3] Trainer log: {'loss': 0.8396, 'grad_norm': 7.63027811050415, 'learning_rate': 6.320910276119576e-06} [Rank 0] Trainer log: {'loss': 0.8396, 'grad_norm': 7.63027811050415, 'learning_rate': 6.320910276119576e-06} [Rank 1] Trainer log: {'loss': 0.8396, 'grad_norm': 7.63027811050415, 'learning_rate': 6.320910276119576e-06} {'loss': 0.8396, 'grad_norm': 7.63027811050415, 'learning_rate': 6.320910276119576e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8168, 'grad_norm': 3.7000205516815186, 'learning_rate': 6.314570244164095e-06}[Rank 3] Trainer log: {'loss': 0.8168, 'grad_norm': 3.7000205516815186, 'learning_rate': 6.314570244164095e-06} [Rank 1] Trainer log: {'loss': 0.8168, 'grad_norm': 3.7000205516815186, 'learning_rate': 6.314570244164095e-06} [Rank 0] Trainer log: {'loss': 0.8168, 'grad_norm': 3.7000205516815186, 'learning_rate': 6.314570244164095e-06} {'loss': 0.8168, 'grad_norm': 3.7000205516815186, 'learning_rate': 6.314570244164095e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.7704, 'grad_norm': 2.9262771606445312, 'learning_rate': 6.308231925974299e-06} [Rank 3] Trainer log: {'loss': 0.7704, 'grad_norm': 2.9262771606445312, 'learning_rate': 6.308231925974299e-06} [Rank 1] Trainer log: {'loss': 0.7704, 'grad_norm': 2.9262771606445312, 'learning_rate': 6.308231925974299e-06} [Rank 0] Trainer log: {'loss': 0.7704, 'grad_norm': 2.9262771606445312, 'learning_rate': 6.308231925974299e-06} {'loss': 0.7704, 'grad_norm': 2.9262771606445312, 'learning_rate': 6.308231925974299e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.9356, 'grad_norm': 4.616698741912842, 'learning_rate': 6.301895324497569e-06} [Rank 3] Trainer log: {'loss': 0.9356, 'grad_norm': 4.616698741912842, 'learning_rate': 6.301895324497569e-06}[Rank 1] Trainer log: {'loss': 0.9356, 'grad_norm': 4.616698741912842, 'learning_rate': 6.301895324497569e-06} [Rank 0] Trainer log: {'loss': 0.9356, 'grad_norm': 4.616698741912842, 'learning_rate': 6.301895324497569e-06} {'loss': 0.9356, 'grad_norm': 4.616698741912842, 'learning_rate': 6.301895324497569e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.692, 'grad_norm': 3.969125747680664, 'learning_rate': 6.295560442680505e-06} [Rank 2] Trainer log: {'loss': 0.692, 'grad_norm': 3.969125747680664, 'learning_rate': 6.295560442680505e-06}[Rank 3] Trainer log: {'loss': 0.692, 'grad_norm': 3.969125747680664, 'learning_rate': 6.295560442680505e-06} [Rank 0] Trainer log: {'loss': 0.692, 'grad_norm': 3.969125747680664, 'learning_rate': 6.295560442680505e-06} {'loss': 0.692, 'grad_norm': 3.969125747680664, 'learning_rate': 6.295560442680505e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8941, 'grad_norm': 6.0137038230896, 'learning_rate': 6.289227283468889e-06} [Rank 3] Trainer log: {'loss': 0.8941, 'grad_norm': 6.0137038230896, 'learning_rate': 6.289227283468889e-06}[Rank 0] Trainer log: {'loss': 0.8941, 'grad_norm': 6.0137038230896, 'learning_rate': 6.289227283468889e-06} [Rank 1] Trainer log: {'loss': 0.8941, 'grad_norm': 6.0137038230896, 'learning_rate': 6.289227283468889e-06} {'loss': 0.8941, 'grad_norm': 6.0137038230896, 'learning_rate': 6.289227283468889e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8103, 'grad_norm': 3.6183009147644043, 'learning_rate': 6.282895849807712e-06}[Rank 1] Trainer log: {'loss': 0.8103, 'grad_norm': 3.6183009147644043, 'learning_rate': 6.282895849807712e-06} [Rank 0] Trainer log: {'loss': 0.8103, 'grad_norm': 3.6183009147644043, 'learning_rate': 6.282895849807712e-06} [Rank 3] Trainer log: {'loss': 0.8103, 'grad_norm': 3.6183009147644043, 'learning_rate': 6.282895849807712e-06} {'loss': 0.8103, 'grad_norm': 3.6183009147644043, 'learning_rate': 6.282895849807712e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.804, 'grad_norm': 3.1626737117767334, 'learning_rate': 6.276566144641165e-06} [Rank 1] Trainer log: {'loss': 0.804, 'grad_norm': 3.1626737117767334, 'learning_rate': 6.276566144641165e-06}[Rank 2] Trainer log: {'loss': 0.804, 'grad_norm': 3.1626737117767334, 'learning_rate': 6.276566144641165e-06} [Rank 0] Trainer log: {'loss': 0.804, 'grad_norm': 3.1626737117767334, 'learning_rate': 6.276566144641165e-06} {'loss': 0.804, 'grad_norm': 3.1626737117767334, 'learning_rate': 6.276566144641165e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.6183, 'grad_norm': 4.0099005699157715, 'learning_rate': 6.270238170912628e-06}[Rank 3] Trainer log: {'loss': 0.6183, 'grad_norm': 4.0099005699157715, 'learning_rate': 6.270238170912628e-06} [Rank 1] Trainer log: {'loss': 0.6183, 'grad_norm': 4.0099005699157715, 'learning_rate': 6.270238170912628e-06} [Rank 0] Trainer log: {'loss': 0.6183, 'grad_norm': 4.0099005699157715, 'learning_rate': 6.270238170912628e-06} {'loss': 0.6183, 'grad_norm': 4.0099005699157715, 'learning_rate': 6.270238170912628e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8469, 'grad_norm': 3.9255034923553467, 'learning_rate': 6.263911931564677e-06}[Rank 3] Trainer log: {'loss': 0.8469, 'grad_norm': 3.9255034923553467, 'learning_rate': 6.263911931564677e-06}[Rank 2] Trainer log: {'loss': 0.8469, 'grad_norm': 3.9255034923553467, 'learning_rate': 6.263911931564677e-06} [Rank 0] Trainer log: {'loss': 0.8469, 'grad_norm': 3.9255034923553467, 'learning_rate': 6.263911931564677e-06} {'loss': 0.8469, 'grad_norm': 3.9255034923553467, 'learning_rate': 6.263911931564677e-06, 'epoch': 0.64} [Rank 0] Trainer log: {'loss': 0.7585, 'grad_norm': 21.853429794311523, 'learning_rate': 6.257587429539087e-06}[Rank 2] Trainer log: {'loss': 0.7585, 'grad_norm': 21.853429794311523, 'learning_rate': 6.257587429539087e-06} [Rank 3] Trainer log: {'loss': 0.7585, 'grad_norm': 21.853429794311523, 'learning_rate': 6.257587429539087e-06} [Rank 1] Trainer log: {'loss': 0.7585, 'grad_norm': 21.853429794311523, 'learning_rate': 6.257587429539087e-06} {'loss': 0.7585, 'grad_norm': 21.853429794311523, 'learning_rate': 6.257587429539087e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8448, 'grad_norm': 8.074460983276367, 'learning_rate': 6.251264667776821e-06}[Rank 3] Trainer log: {'loss': 0.8448, 'grad_norm': 8.074460983276367, 'learning_rate': 6.251264667776821e-06} [Rank 0] Trainer log: {'loss': 0.8448, 'grad_norm': 8.074460983276367, 'learning_rate': 6.251264667776821e-06}[Rank 2] Trainer log: {'loss': 0.8448, 'grad_norm': 8.074460983276367, 'learning_rate': 6.251264667776821e-06} {'loss': 0.8448, 'grad_norm': 8.074460983276367, 'learning_rate': 6.251264667776821e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.7795, 'grad_norm': 4.615368843078613, 'learning_rate': 6.24494364921803e-06}[Rank 1] Trainer log: {'loss': 0.7795, 'grad_norm': 4.615368843078613, 'learning_rate': 6.24494364921803e-06}[Rank 2] Trainer log: {'loss': 0.7795, 'grad_norm': 4.615368843078613, 'learning_rate': 6.24494364921803e-06} [Rank 0] Trainer log: {'loss': 0.7795, 'grad_norm': 4.615368843078613, 'learning_rate': 6.24494364921803e-06} {'loss': 0.7795, 'grad_norm': 4.615368843078613, 'learning_rate': 6.24494364921803e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 1.0518, 'grad_norm': 2.3472797870635986, 'learning_rate': 6.238624376802058e-06}[Rank 1] Trainer log: {'loss': 1.0518, 'grad_norm': 2.3472797870635986, 'learning_rate': 6.238624376802058e-06} [Rank 0] Trainer log: {'loss': 1.0518, 'grad_norm': 2.3472797870635986, 'learning_rate': 6.238624376802058e-06}[Rank 3] Trainer log: {'loss': 1.0518, 'grad_norm': 2.3472797870635986, 'learning_rate': 6.238624376802058e-06} {'loss': 1.0518, 'grad_norm': 2.3472797870635986, 'learning_rate': 6.238624376802058e-06, 'epoch': 0.64} [Rank 0] Trainer log: {'loss': 0.6393, 'grad_norm': 2.024301767349243, 'learning_rate': 6.232306853467443e-06}[Rank 3] Trainer log: {'loss': 0.6393, 'grad_norm': 2.024301767349243, 'learning_rate': 6.232306853467443e-06}[Rank 2] Trainer log: {'loss': 0.6393, 'grad_norm': 2.024301767349243, 'learning_rate': 6.232306853467443e-06} [Rank 1] Trainer log: {'loss': 0.6393, 'grad_norm': 2.024301767349243, 'learning_rate': 6.232306853467443e-06} {'loss': 0.6393, 'grad_norm': 2.024301767349243, 'learning_rate': 6.232306853467443e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8158, 'grad_norm': 4.464213848114014, 'learning_rate': 6.2259910821519e-06}[Rank 0] Trainer log: {'loss': 0.8158, 'grad_norm': 4.464213848114014, 'learning_rate': 6.2259910821519e-06}[Rank 3] Trainer log: {'loss': 0.8158, 'grad_norm': 4.464213848114014, 'learning_rate': 6.2259910821519e-06} [Rank 1] Trainer log: {'loss': 0.8158, 'grad_norm': 4.464213848114014, 'learning_rate': 6.2259910821519e-06} {'loss': 0.8158, 'grad_norm': 4.464213848114014, 'learning_rate': 6.2259910821519e-06, 'epoch': 0.64} [Rank 0] Trainer log: {'loss': 1.0179, 'grad_norm': 9.846531867980957, 'learning_rate': 6.2196770657923265e-06}[Rank 2] Trainer log: {'loss': 1.0179, 'grad_norm': 9.846531867980957, 'learning_rate': 6.2196770657923265e-06}[Rank 3] Trainer log: {'loss': 1.0179, 'grad_norm': 9.846531867980957, 'learning_rate': 6.2196770657923265e-06} [Rank 1] Trainer log: {'loss': 1.0179, 'grad_norm': 9.846531867980957, 'learning_rate': 6.2196770657923265e-06} {'loss': 1.0179, 'grad_norm': 9.846531867980957, 'learning_rate': 6.2196770657923265e-06, 'epoch': 0.64} [Rank 3] Trainer log: {'loss': 0.8538, 'grad_norm': 4.066442966461182, 'learning_rate': 6.213364807324817e-06}[Rank 2] Trainer log: {'loss': 0.8538, 'grad_norm': 4.066442966461182, 'learning_rate': 6.213364807324817e-06}[Rank 1] Trainer log: {'loss': 0.8538, 'grad_norm': 4.066442966461182, 'learning_rate': 6.213364807324817e-06} [Rank 0] Trainer log: {'loss': 0.8538, 'grad_norm': 4.066442966461182, 'learning_rate': 6.213364807324817e-06} {'loss': 0.8538, 'grad_norm': 4.066442966461182, 'learning_rate': 6.213364807324817e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 1.07, 'grad_norm': 3.051090717315674, 'learning_rate': 6.207054309684644e-06} [Rank 1] Trainer log: {'loss': 1.07, 'grad_norm': 3.051090717315674, 'learning_rate': 6.207054309684644e-06} [Rank 3] Trainer log: {'loss': 1.07, 'grad_norm': 3.051090717315674, 'learning_rate': 6.207054309684644e-06} [Rank 0] Trainer log: {'loss': 1.07, 'grad_norm': 3.051090717315674, 'learning_rate': 6.207054309684644e-06} {'loss': 1.07, 'grad_norm': 3.051090717315674, 'learning_rate': 6.207054309684644e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8062, 'grad_norm': 9.91240119934082, 'learning_rate': 6.200745575806249e-06}[Rank 1] Trainer log: {'loss': 0.8062, 'grad_norm': 9.91240119934082, 'learning_rate': 6.200745575806249e-06}[Rank 3] Trainer log: {'loss': 0.8062, 'grad_norm': 9.91240119934082, 'learning_rate': 6.200745575806249e-06} [Rank 0] Trainer log: {'loss': 0.8062, 'grad_norm': 9.91240119934082, 'learning_rate': 6.200745575806249e-06} {'loss': 0.8062, 'grad_norm': 9.91240119934082, 'learning_rate': 6.200745575806249e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.6831, 'grad_norm': 2.6689178943634033, 'learning_rate': 6.194438608623272e-06}[Rank 2] Trainer log: {'loss': 0.6831, 'grad_norm': 2.6689178943634033, 'learning_rate': 6.194438608623272e-06}[Rank 0] Trainer log: {'loss': 0.6831, 'grad_norm': 2.6689178943634033, 'learning_rate': 6.194438608623272e-06} [Rank 3] Trainer log: {'loss': 0.6831, 'grad_norm': 2.6689178943634033, 'learning_rate': 6.194438608623272e-06} {'loss': 0.6831, 'grad_norm': 2.6689178943634033, 'learning_rate': 6.194438608623272e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.901, 'grad_norm': 5.154301166534424, 'learning_rate': 6.1881334110685185e-06}[Rank 1] Trainer log: {'loss': 0.901, 'grad_norm': 5.154301166534424, 'learning_rate': 6.1881334110685185e-06} [Rank 3] Trainer log: {'loss': 0.901, 'grad_norm': 5.154301166534424, 'learning_rate': 6.1881334110685185e-06} [Rank 0] Trainer log: {'loss': 0.901, 'grad_norm': 5.154301166534424, 'learning_rate': 6.1881334110685185e-06} {'loss': 0.901, 'grad_norm': 5.154301166534424, 'learning_rate': 6.1881334110685185e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8698, 'grad_norm': 11.61998176574707, 'learning_rate': 6.181829986073975e-06} [Rank 2] Trainer log: {'loss': 0.8698, 'grad_norm': 11.61998176574707, 'learning_rate': 6.181829986073975e-06} [Rank 0] Trainer log: {'loss': 0.8698, 'grad_norm': 11.61998176574707, 'learning_rate': 6.181829986073975e-06}[Rank 3] Trainer log: {'loss': 0.8698, 'grad_norm': 11.61998176574707, 'learning_rate': 6.181829986073975e-06} {'loss': 0.8698, 'grad_norm': 11.61998176574707, 'learning_rate': 6.181829986073975e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8877, 'grad_norm': 2.8878791332244873, 'learning_rate': 6.175528336570809e-06}[Rank 1] Trainer log: {'loss': 0.8877, 'grad_norm': 2.8878791332244873, 'learning_rate': 6.175528336570809e-06} [Rank 3] Trainer log: {'loss': 0.8877, 'grad_norm': 2.8878791332244873, 'learning_rate': 6.175528336570809e-06} [Rank 0] Trainer log: {'loss': 0.8877, 'grad_norm': 2.8878791332244873, 'learning_rate': 6.175528336570809e-06} {'loss': 0.8877, 'grad_norm': 2.8878791332244873, 'learning_rate': 6.175528336570809e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8062, 'grad_norm': 5.77826452255249, 'learning_rate': 6.169228465489354e-06}[Rank 1] Trainer log: {'loss': 0.8062, 'grad_norm': 5.77826452255249, 'learning_rate': 6.169228465489354e-06} [Rank 3] Trainer log: {'loss': 0.8062, 'grad_norm': 5.77826452255249, 'learning_rate': 6.169228465489354e-06} [Rank 0] Trainer log: {'loss': 0.8062, 'grad_norm': 5.77826452255249, 'learning_rate': 6.169228465489354e-06} {'loss': 0.8062, 'grad_norm': 5.77826452255249, 'learning_rate': 6.169228465489354e-06, 'epoch': 0.64} [Rank 1] Trainer log: {'loss': 0.8031, 'grad_norm': 5.0297017097473145, 'learning_rate': 6.162930375759114e-06}[Rank 2] Trainer log: {'loss': 0.8031, 'grad_norm': 5.0297017097473145, 'learning_rate': 6.162930375759114e-06} [Rank 0] Trainer log: {'loss': 0.8031, 'grad_norm': 5.0297017097473145, 'learning_rate': 6.162930375759114e-06} [Rank 3] Trainer log: {'loss': 0.8031, 'grad_norm': 5.0297017097473145, 'learning_rate': 6.162930375759114e-06} {'loss': 0.8031, 'grad_norm': 5.0297017097473145, 'learning_rate': 6.162930375759114e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.7608, 'grad_norm': 3.8420512676239014, 'learning_rate': 6.1566340703087805e-06}[Rank 3] Trainer log: {'loss': 0.7608, 'grad_norm': 3.8420512676239014, 'learning_rate': 6.1566340703087805e-06} [Rank 0] Trainer log: {'loss': 0.7608, 'grad_norm': 3.8420512676239014, 'learning_rate': 6.1566340703087805e-06}[Rank 1] Trainer log: {'loss': 0.7608, 'grad_norm': 3.8420512676239014, 'learning_rate': 6.1566340703087805e-06} {'loss': 0.7608, 'grad_norm': 3.8420512676239014, 'learning_rate': 6.1566340703087805e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.562, 'grad_norm': 6.588086128234863, 'learning_rate': 6.1503395520662e-06}[Rank 0] Trainer log: {'loss': 0.562, 'grad_norm': 6.588086128234863, 'learning_rate': 6.1503395520662e-06}[Rank 1] Trainer log: {'loss': 0.562, 'grad_norm': 6.588086128234863, 'learning_rate': 6.1503395520662e-06} [Rank 3] Trainer log: {'loss': 0.562, 'grad_norm': 6.588086128234863, 'learning_rate': 6.1503395520662e-06} {'loss': 0.562, 'grad_norm': 6.588086128234863, 'learning_rate': 6.1503395520662e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.7022, 'grad_norm': 5.43993616104126, 'learning_rate': 6.144046823958392e-06}[Rank 1] Trainer log: {'loss': 0.7022, 'grad_norm': 5.43993616104126, 'learning_rate': 6.144046823958392e-06} [Rank 3] Trainer log: {'loss': 0.7022, 'grad_norm': 5.43993616104126, 'learning_rate': 6.144046823958392e-06} [Rank 0] Trainer log: {'loss': 0.7022, 'grad_norm': 5.43993616104126, 'learning_rate': 6.144046823958392e-06} {'loss': 0.7022, 'grad_norm': 5.43993616104126, 'learning_rate': 6.144046823958392e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8959, 'grad_norm': 2.716481924057007, 'learning_rate': 6.137755888911551e-06} [Rank 1] Trainer log: {'loss': 0.8959, 'grad_norm': 2.716481924057007, 'learning_rate': 6.137755888911551e-06} [Rank 3] Trainer log: {'loss': 0.8959, 'grad_norm': 2.716481924057007, 'learning_rate': 6.137755888911551e-06} [Rank 0] Trainer log: {'loss': 0.8959, 'grad_norm': 2.716481924057007, 'learning_rate': 6.137755888911551e-06} {'loss': 0.8959, 'grad_norm': 2.716481924057007, 'learning_rate': 6.137755888911551e-06, 'epoch': 0.64} [Rank 2] Trainer log: {'loss': 0.8474, 'grad_norm': 9.567503929138184, 'learning_rate': 6.131466749851025e-06} [Rank 1] Trainer log: {'loss': 0.8474, 'grad_norm': 9.567503929138184, 'learning_rate': 6.131466749851025e-06} [Rank 3] Trainer log: {'loss': 0.8474, 'grad_norm': 9.567503929138184, 'learning_rate': 6.131466749851025e-06} [Rank 0] Trainer log: {'loss': 0.8474, 'grad_norm': 9.567503929138184, 'learning_rate': 6.131466749851025e-06} {'loss': 0.8474, 'grad_norm': 9.567503929138184, 'learning_rate': 6.131466749851025e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.9451, 'grad_norm': 3.964350461959839, 'learning_rate': 6.125179409701334e-06} [Rank 2] Trainer log: {'loss': 0.9451, 'grad_norm': 3.964350461959839, 'learning_rate': 6.125179409701334e-06}[Rank 0] Trainer log: {'loss': 0.9451, 'grad_norm': 3.964350461959839, 'learning_rate': 6.125179409701334e-06} [Rank 3] Trainer log: {'loss': 0.9451, 'grad_norm': 3.964350461959839, 'learning_rate': 6.125179409701334e-06} {'loss': 0.9451, 'grad_norm': 3.964350461959839, 'learning_rate': 6.125179409701334e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.6067, 'grad_norm': 3.9131832122802734, 'learning_rate': 6.118893871386167e-06}[Rank 3] Trainer log: {'loss': 0.6067, 'grad_norm': 3.9131832122802734, 'learning_rate': 6.118893871386167e-06} [Rank 1] Trainer log: {'loss': 0.6067, 'grad_norm': 3.9131832122802734, 'learning_rate': 6.118893871386167e-06} [Rank 0] Trainer log: {'loss': 0.6067, 'grad_norm': 3.9131832122802734, 'learning_rate': 6.118893871386167e-06} {'loss': 0.6067, 'grad_norm': 3.9131832122802734, 'learning_rate': 6.118893871386167e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.944, 'grad_norm': 3.97452449798584, 'learning_rate': 6.112610137828363e-06} [Rank 1] Trainer log: {'loss': 0.944, 'grad_norm': 3.97452449798584, 'learning_rate': 6.112610137828363e-06} [Rank 0] Trainer log: {'loss': 0.944, 'grad_norm': 3.97452449798584, 'learning_rate': 6.112610137828363e-06}[Rank 3] Trainer log: {'loss': 0.944, 'grad_norm': 3.97452449798584, 'learning_rate': 6.112610137828363e-06} {'loss': 0.944, 'grad_norm': 3.97452449798584, 'learning_rate': 6.112610137828363e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.9103, 'grad_norm': 9.79313850402832, 'learning_rate': 6.106328211949928e-06}[Rank 2] Trainer log: {'loss': 0.9103, 'grad_norm': 9.79313850402832, 'learning_rate': 6.106328211949928e-06} [Rank 3] Trainer log: {'loss': 0.9103, 'grad_norm': 9.79313850402832, 'learning_rate': 6.106328211949928e-06} [Rank 0] Trainer log: {'loss': 0.9103, 'grad_norm': 9.79313850402832, 'learning_rate': 6.106328211949928e-06} {'loss': 0.9103, 'grad_norm': 9.79313850402832, 'learning_rate': 6.106328211949928e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.5941, 'grad_norm': 7.217923164367676, 'learning_rate': 6.100048096672029e-06}[Rank 3] Trainer log: {'loss': 0.5941, 'grad_norm': 7.217923164367676, 'learning_rate': 6.100048096672029e-06} [Rank 0] Trainer log: {'loss': 0.5941, 'grad_norm': 7.217923164367676, 'learning_rate': 6.100048096672029e-06} [Rank 1] Trainer log: {'loss': 0.5941, 'grad_norm': 7.217923164367676, 'learning_rate': 6.100048096672029e-06} {'loss': 0.5941, 'grad_norm': 7.217923164367676, 'learning_rate': 6.100048096672029e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8548, 'grad_norm': 5.958413124084473, 'learning_rate': 6.09376979491499e-06}[Rank 3] Trainer log: {'loss': 0.8548, 'grad_norm': 5.958413124084473, 'learning_rate': 6.09376979491499e-06} [Rank 1] Trainer log: {'loss': 0.8548, 'grad_norm': 5.958413124084473, 'learning_rate': 6.09376979491499e-06} [Rank 0] Trainer log: {'loss': 0.8548, 'grad_norm': 5.958413124084473, 'learning_rate': 6.09376979491499e-06} {'loss': 0.8548, 'grad_norm': 5.958413124084473, 'learning_rate': 6.09376979491499e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8975, 'grad_norm': 4.447427749633789, 'learning_rate': 6.0874933095982915e-06} [Rank 1] Trainer log: {'loss': 0.8975, 'grad_norm': 4.447427749633789, 'learning_rate': 6.0874933095982915e-06} [Rank 0] Trainer log: {'loss': 0.8975, 'grad_norm': 4.447427749633789, 'learning_rate': 6.0874933095982915e-06}[Rank 3] Trainer log: {'loss': 0.8975, 'grad_norm': 4.447427749633789, 'learning_rate': 6.0874933095982915e-06} {'loss': 0.8975, 'grad_norm': 4.447427749633789, 'learning_rate': 6.0874933095982915e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.7018, 'grad_norm': 2.1844675540924072, 'learning_rate': 6.0812186436405605e-06}[Rank 2] Trainer log: {'loss': 0.7018, 'grad_norm': 2.1844675540924072, 'learning_rate': 6.0812186436405605e-06} [Rank 3] Trainer log: {'loss': 0.7018, 'grad_norm': 2.1844675540924072, 'learning_rate': 6.0812186436405605e-06} [Rank 0] Trainer log: {'loss': 0.7018, 'grad_norm': 2.1844675540924072, 'learning_rate': 6.0812186436405605e-06} {'loss': 0.7018, 'grad_norm': 2.1844675540924072, 'learning_rate': 6.0812186436405605e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 1.0295, 'grad_norm': 4.0844035148620605, 'learning_rate': 6.0749457999595976e-06}[Rank 0] Trainer log: {'loss': 1.0295, 'grad_norm': 4.0844035148620605, 'learning_rate': 6.0749457999595976e-06}[Rank 1] Trainer log: {'loss': 1.0295, 'grad_norm': 4.0844035148620605, 'learning_rate': 6.0749457999595976e-06} [Rank 3] Trainer log: {'loss': 1.0295, 'grad_norm': 4.0844035148620605, 'learning_rate': 6.0749457999595976e-06} {'loss': 1.0295, 'grad_norm': 4.0844035148620605, 'learning_rate': 6.0749457999595976e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.5311, 'grad_norm': 3.7694811820983887, 'learning_rate': 6.06867478147234e-06}[Rank 3] Trainer log: {'loss': 0.5311, 'grad_norm': 3.7694811820983887, 'learning_rate': 6.06867478147234e-06}[Rank 2] Trainer log: {'loss': 0.5311, 'grad_norm': 3.7694811820983887, 'learning_rate': 6.06867478147234e-06} [Rank 0] Trainer log: {'loss': 0.5311, 'grad_norm': 3.7694811820983887, 'learning_rate': 6.06867478147234e-06} {'loss': 0.5311, 'grad_norm': 3.7694811820983887, 'learning_rate': 6.06867478147234e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8136, 'grad_norm': 4.683138847351074, 'learning_rate': 6.062405591094876e-06}[Rank 3] Trainer log: {'loss': 0.8136, 'grad_norm': 4.683138847351074, 'learning_rate': 6.062405591094876e-06}[Rank 0] Trainer log: {'loss': 0.8136, 'grad_norm': 4.683138847351074, 'learning_rate': 6.062405591094876e-06} [Rank 1] Trainer log: {'loss': 0.8136, 'grad_norm': 4.683138847351074, 'learning_rate': 6.062405591094876e-06} {'loss': 0.8136, 'grad_norm': 4.683138847351074, 'learning_rate': 6.062405591094876e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.7036, 'grad_norm': 10.733972549438477, 'learning_rate': 6.056138231742459e-06}[Rank 2] Trainer log: {'loss': 0.7036, 'grad_norm': 10.733972549438477, 'learning_rate': 6.056138231742459e-06} [Rank 3] Trainer log: {'loss': 0.7036, 'grad_norm': 10.733972549438477, 'learning_rate': 6.056138231742459e-06} [Rank 0] Trainer log: {'loss': 0.7036, 'grad_norm': 10.733972549438477, 'learning_rate': 6.056138231742459e-06} {'loss': 0.7036, 'grad_norm': 10.733972549438477, 'learning_rate': 6.056138231742459e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.6354, 'grad_norm': 7.578339576721191, 'learning_rate': 6.0498727063294714e-06} [Rank 1] Trainer log: {'loss': 0.6354, 'grad_norm': 7.578339576721191, 'learning_rate': 6.0498727063294714e-06}[Rank 3] Trainer log: {'loss': 0.6354, 'grad_norm': 7.578339576721191, 'learning_rate': 6.0498727063294714e-06} [Rank 0] Trainer log: {'loss': 0.6354, 'grad_norm': 7.578339576721191, 'learning_rate': 6.0498727063294714e-06} {'loss': 0.6354, 'grad_norm': 7.578339576721191, 'learning_rate': 6.0498727063294714e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8558, 'grad_norm': 2.7026236057281494, 'learning_rate': 6.043609017769453e-06} [Rank 3] Trainer log: {'loss': 0.8558, 'grad_norm': 2.7026236057281494, 'learning_rate': 6.043609017769453e-06}[Rank 1] Trainer log: {'loss': 0.8558, 'grad_norm': 2.7026236057281494, 'learning_rate': 6.043609017769453e-06} [Rank 0] Trainer log: {'loss': 0.8558, 'grad_norm': 2.7026236057281494, 'learning_rate': 6.043609017769453e-06} {'loss': 0.8558, 'grad_norm': 2.7026236057281494, 'learning_rate': 6.043609017769453e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 1.0233, 'grad_norm': 4.656903266906738, 'learning_rate': 6.037347168975094e-06}[Rank 0] Trainer log: {'loss': 1.0233, 'grad_norm': 4.656903266906738, 'learning_rate': 6.037347168975094e-06} [Rank 2] Trainer log: {'loss': 1.0233, 'grad_norm': 4.656903266906738, 'learning_rate': 6.037347168975094e-06} [Rank 3] Trainer log: {'loss': 1.0233, 'grad_norm': 4.656903266906738, 'learning_rate': 6.037347168975094e-06} {'loss': 1.0233, 'grad_norm': 4.656903266906738, 'learning_rate': 6.037347168975094e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.7971, 'grad_norm': 5.14186954498291, 'learning_rate': 6.031087162858221e-06}[Rank 0] Trainer log: {'loss': 0.7971, 'grad_norm': 5.14186954498291, 'learning_rate': 6.031087162858221e-06} [Rank 2] Trainer log: {'loss': 0.7971, 'grad_norm': 5.14186954498291, 'learning_rate': 6.031087162858221e-06} [Rank 3] Trainer log: {'loss': 0.7971, 'grad_norm': 5.14186954498291, 'learning_rate': 6.031087162858221e-06} {'loss': 0.7971, 'grad_norm': 5.14186954498291, 'learning_rate': 6.031087162858221e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8352, 'grad_norm': 2.724268674850464, 'learning_rate': 6.024829002329802e-06}[Rank 1] Trainer log: {'loss': 0.8352, 'grad_norm': 2.724268674850464, 'learning_rate': 6.024829002329802e-06}[Rank 3] Trainer log: {'loss': 0.8352, 'grad_norm': 2.724268674850464, 'learning_rate': 6.024829002329802e-06} [Rank 0] Trainer log: {'loss': 0.8352, 'grad_norm': 2.724268674850464, 'learning_rate': 6.024829002329802e-06} {'loss': 0.8352, 'grad_norm': 2.724268674850464, 'learning_rate': 6.024829002329802e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 1.1049, 'grad_norm': 4.678193092346191, 'learning_rate': 6.018572690299956e-06}[Rank 2] Trainer log: {'loss': 1.1049, 'grad_norm': 4.678193092346191, 'learning_rate': 6.018572690299956e-06} [Rank 0] Trainer log: {'loss': 1.1049, 'grad_norm': 4.678193092346191, 'learning_rate': 6.018572690299956e-06} [Rank 3] Trainer log: {'loss': 1.1049, 'grad_norm': 4.678193092346191, 'learning_rate': 6.018572690299956e-06} {'loss': 1.1049, 'grad_norm': 4.678193092346191, 'learning_rate': 6.018572690299956e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.75, 'grad_norm': 7.005396366119385, 'learning_rate': 6.0123182296779355e-06} [Rank 3] Trainer log: {'loss': 0.75, 'grad_norm': 7.005396366119385, 'learning_rate': 6.0123182296779355e-06} [Rank 0] Trainer log: {'loss': 0.75, 'grad_norm': 7.005396366119385, 'learning_rate': 6.0123182296779355e-06}[Rank 1] Trainer log: {'loss': 0.75, 'grad_norm': 7.005396366119385, 'learning_rate': 6.0123182296779355e-06} {'loss': 0.75, 'grad_norm': 7.005396366119385, 'learning_rate': 6.0123182296779355e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 1.0532, 'grad_norm': 2.6553382873535156, 'learning_rate': 6.006065623372132e-06} [Rank 2] Trainer log: {'loss': 1.0532, 'grad_norm': 2.6553382873535156, 'learning_rate': 6.006065623372132e-06} [Rank 0] Trainer log: {'loss': 1.0532, 'grad_norm': 2.6553382873535156, 'learning_rate': 6.006065623372132e-06} [Rank 1] Trainer log: {'loss': 1.0532, 'grad_norm': 2.6553382873535156, 'learning_rate': 6.006065623372132e-06} {'loss': 1.0532, 'grad_norm': 2.6553382873535156, 'learning_rate': 6.006065623372132e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.9356, 'grad_norm': 8.46938705444336, 'learning_rate': 5.999814874290084e-06}[Rank 3] Trainer log: {'loss': 0.9356, 'grad_norm': 8.46938705444336, 'learning_rate': 5.999814874290084e-06}[Rank 1] Trainer log: {'loss': 0.9356, 'grad_norm': 8.46938705444336, 'learning_rate': 5.999814874290084e-06} [Rank 0] Trainer log: {'loss': 0.9356, 'grad_norm': 8.46938705444336, 'learning_rate': 5.999814874290084e-06} {'loss': 0.9356, 'grad_norm': 8.46938705444336, 'learning_rate': 5.999814874290084e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.9358, 'grad_norm': 6.367713451385498, 'learning_rate': 5.993565985338451e-06} [Rank 1] Trainer log: {'loss': 0.9358, 'grad_norm': 6.367713451385498, 'learning_rate': 5.993565985338451e-06} [Rank 2] Trainer log: {'loss': 0.9358, 'grad_norm': 6.367713451385498, 'learning_rate': 5.993565985338451e-06} [Rank 0] Trainer log: {'loss': 0.9358, 'grad_norm': 6.367713451385498, 'learning_rate': 5.993565985338451e-06} {'loss': 0.9358, 'grad_norm': 6.367713451385498, 'learning_rate': 5.993565985338451e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.9176, 'grad_norm': 4.8165059089660645, 'learning_rate': 5.987318959423036e-06}[Rank 3] Trainer log: {'loss': 0.9176, 'grad_norm': 4.8165059089660645, 'learning_rate': 5.987318959423036e-06}[Rank 1] Trainer log: {'loss': 0.9176, 'grad_norm': 4.8165059089660645, 'learning_rate': 5.987318959423036e-06} [Rank 0] Trainer log: {'loss': 0.9176, 'grad_norm': 4.8165059089660645, 'learning_rate': 5.987318959423036e-06} {'loss': 0.9176, 'grad_norm': 4.8165059089660645, 'learning_rate': 5.987318959423036e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.7512, 'grad_norm': 4.05739164352417, 'learning_rate': 5.981073799448783e-06}[Rank 3] Trainer log: {'loss': 0.7512, 'grad_norm': 4.05739164352417, 'learning_rate': 5.981073799448783e-06} [Rank 1] Trainer log: {'loss': 0.7512, 'grad_norm': 4.05739164352417, 'learning_rate': 5.981073799448783e-06} [Rank 0] Trainer log: {'loss': 0.7512, 'grad_norm': 4.05739164352417, 'learning_rate': 5.981073799448783e-06} {'loss': 0.7512, 'grad_norm': 4.05739164352417, 'learning_rate': 5.981073799448783e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.8806, 'grad_norm': 3.657972812652588, 'learning_rate': 5.974830508319756e-06}[Rank 2] Trainer log: {'loss': 0.8806, 'grad_norm': 3.657972812652588, 'learning_rate': 5.974830508319756e-06} [Rank 3] Trainer log: {'loss': 0.8806, 'grad_norm': 3.657972812652588, 'learning_rate': 5.974830508319756e-06} [Rank 0] Trainer log: {'loss': 0.8806, 'grad_norm': 3.657972812652588, 'learning_rate': 5.974830508319756e-06} {'loss': 0.8806, 'grad_norm': 3.657972812652588, 'learning_rate': 5.974830508319756e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.9132, 'grad_norm': 2.9387872219085693, 'learning_rate': 5.96858908893915e-06}[Rank 3] Trainer log: {'loss': 0.9132, 'grad_norm': 2.9387872219085693, 'learning_rate': 5.96858908893915e-06} [Rank 2] Trainer log: {'loss': 0.9132, 'grad_norm': 2.9387872219085693, 'learning_rate': 5.96858908893915e-06} [Rank 0] Trainer log: {'loss': 0.9132, 'grad_norm': 2.9387872219085693, 'learning_rate': 5.96858908893915e-06} {'loss': 0.9132, 'grad_norm': 2.9387872219085693, 'learning_rate': 5.96858908893915e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 1.0619, 'grad_norm': 9.03840160369873, 'learning_rate': 5.962349544209299e-06}[Rank 2] Trainer log: {'loss': 1.0619, 'grad_norm': 9.03840160369873, 'learning_rate': 5.962349544209299e-06}[Rank 1] Trainer log: {'loss': 1.0619, 'grad_norm': 9.03840160369873, 'learning_rate': 5.962349544209299e-06} [Rank 0] Trainer log: {'loss': 1.0619, 'grad_norm': 9.03840160369873, 'learning_rate': 5.962349544209299e-06} {'loss': 1.0619, 'grad_norm': 9.03840160369873, 'learning_rate': 5.962349544209299e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.879, 'grad_norm': 4.041749477386475, 'learning_rate': 5.956111877031663e-06} [Rank 1] Trainer log: {'loss': 0.879, 'grad_norm': 4.041749477386475, 'learning_rate': 5.956111877031663e-06}[Rank 3] Trainer log: {'loss': 0.879, 'grad_norm': 4.041749477386475, 'learning_rate': 5.956111877031663e-06} [Rank 0] Trainer log: {'loss': 0.879, 'grad_norm': 4.041749477386475, 'learning_rate': 5.956111877031663e-06} {'loss': 0.879, 'grad_norm': 4.041749477386475, 'learning_rate': 5.956111877031663e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 1.0005, 'grad_norm': 1.9895282983779907, 'learning_rate': 5.949876090306819e-06}[Rank 1] Trainer log: {'loss': 1.0005, 'grad_norm': 1.9895282983779907, 'learning_rate': 5.949876090306819e-06}[Rank 0] Trainer log: {'loss': 1.0005, 'grad_norm': 1.9895282983779907, 'learning_rate': 5.949876090306819e-06} [Rank 2] Trainer log: {'loss': 1.0005, 'grad_norm': 1.9895282983779907, 'learning_rate': 5.949876090306819e-06} {'loss': 1.0005, 'grad_norm': 1.9895282983779907, 'learning_rate': 5.949876090306819e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.9056, 'grad_norm': 2.270080327987671, 'learning_rate': 5.943642186934484e-06} [Rank 3] Trainer log: {'loss': 0.9056, 'grad_norm': 2.270080327987671, 'learning_rate': 5.943642186934484e-06} [Rank 1] Trainer log: {'loss': 0.9056, 'grad_norm': 2.270080327987671, 'learning_rate': 5.943642186934484e-06}[Rank 0] Trainer log: {'loss': 0.9056, 'grad_norm': 2.270080327987671, 'learning_rate': 5.943642186934484e-06} {'loss': 0.9056, 'grad_norm': 2.270080327987671, 'learning_rate': 5.943642186934484e-06, 'epoch': 0.65} [Rank 0] Trainer log: {'loss': 0.7969, 'grad_norm': 3.748708963394165, 'learning_rate': 5.937410169813489e-06}[Rank 1] Trainer log: {'loss': 0.7969, 'grad_norm': 3.748708963394165, 'learning_rate': 5.937410169813489e-06} [Rank 2] Trainer log: {'loss': 0.7969, 'grad_norm': 3.748708963394165, 'learning_rate': 5.937410169813489e-06} [Rank 3] Trainer log: {'loss': 0.7969, 'grad_norm': 3.748708963394165, 'learning_rate': 5.937410169813489e-06} {'loss': 0.7969, 'grad_norm': 3.748708963394165, 'learning_rate': 5.937410169813489e-06, 'epoch': 0.65} [Rank 0] Trainer log: {'loss': 1.0037, 'grad_norm': 3.84599232673645, 'learning_rate': 5.931180041841789e-06}[Rank 2] Trainer log: {'loss': 1.0037, 'grad_norm': 3.84599232673645, 'learning_rate': 5.931180041841789e-06} [Rank 1] Trainer log: {'loss': 1.0037, 'grad_norm': 3.84599232673645, 'learning_rate': 5.931180041841789e-06} [Rank 3] Trainer log: {'loss': 1.0037, 'grad_norm': 3.84599232673645, 'learning_rate': 5.931180041841789e-06} {'loss': 1.0037, 'grad_norm': 3.84599232673645, 'learning_rate': 5.931180041841789e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.803, 'grad_norm': 2.6972970962524414, 'learning_rate': 5.924951805916468e-06}[Rank 3] Trainer log: {'loss': 0.803, 'grad_norm': 2.6972970962524414, 'learning_rate': 5.924951805916468e-06} [Rank 1] Trainer log: {'loss': 0.803, 'grad_norm': 2.6972970962524414, 'learning_rate': 5.924951805916468e-06} [Rank 0] Trainer log: {'loss': 0.803, 'grad_norm': 2.6972970962524414, 'learning_rate': 5.924951805916468e-06} {'loss': 0.803, 'grad_norm': 2.6972970962524414, 'learning_rate': 5.924951805916468e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.791, 'grad_norm': 5.352077960968018, 'learning_rate': 5.9187254649337215e-06}[Rank 3] Trainer log: {'loss': 0.791, 'grad_norm': 5.352077960968018, 'learning_rate': 5.9187254649337215e-06} [Rank 2] Trainer log: {'loss': 0.791, 'grad_norm': 5.352077960968018, 'learning_rate': 5.9187254649337215e-06} [Rank 0] Trainer log: {'loss': 0.791, 'grad_norm': 5.352077960968018, 'learning_rate': 5.9187254649337215e-06} {'loss': 0.791, 'grad_norm': 5.352077960968018, 'learning_rate': 5.9187254649337215e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 1.0093, 'grad_norm': 2.4438817501068115, 'learning_rate': 5.9125010217888654e-06} [Rank 3] Trainer log: {'loss': 1.0093, 'grad_norm': 2.4438817501068115, 'learning_rate': 5.9125010217888654e-06} [Rank 1] Trainer log: {'loss': 1.0093, 'grad_norm': 2.4438817501068115, 'learning_rate': 5.9125010217888654e-06} [Rank 0] Trainer log: {'loss': 1.0093, 'grad_norm': 2.4438817501068115, 'learning_rate': 5.9125010217888654e-06} {'loss': 1.0093, 'grad_norm': 2.4438817501068115, 'learning_rate': 5.9125010217888654e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.8133, 'grad_norm': 6.14702033996582, 'learning_rate': 5.906278479376335e-06}[Rank 2] Trainer log: {'loss': 0.8133, 'grad_norm': 6.14702033996582, 'learning_rate': 5.906278479376335e-06}[Rank 1] Trainer log: {'loss': 0.8133, 'grad_norm': 6.14702033996582, 'learning_rate': 5.906278479376335e-06} [Rank 0] Trainer log: {'loss': 0.8133, 'grad_norm': 6.14702033996582, 'learning_rate': 5.906278479376335e-06} {'loss': 0.8133, 'grad_norm': 6.14702033996582, 'learning_rate': 5.906278479376335e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.5765, 'grad_norm': 2.677500009536743, 'learning_rate': 5.900057840589688e-06}[Rank 1] Trainer log: {'loss': 0.5765, 'grad_norm': 2.677500009536743, 'learning_rate': 5.900057840589688e-06} [Rank 0] Trainer log: {'loss': 0.5765, 'grad_norm': 2.677500009536743, 'learning_rate': 5.900057840589688e-06}[Rank 3] Trainer log: {'loss': 0.5765, 'grad_norm': 2.677500009536743, 'learning_rate': 5.900057840589688e-06} {'loss': 0.5765, 'grad_norm': 2.677500009536743, 'learning_rate': 5.900057840589688e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.7614, 'grad_norm': 14.377680778503418, 'learning_rate': 5.893839108321584e-06}[Rank 2] Trainer log: {'loss': 0.7614, 'grad_norm': 14.377680778503418, 'learning_rate': 5.893839108321584e-06}[Rank 0] Trainer log: {'loss': 0.7614, 'grad_norm': 14.377680778503418, 'learning_rate': 5.893839108321584e-06} [Rank 1] Trainer log: {'loss': 0.7614, 'grad_norm': 14.377680778503418, 'learning_rate': 5.893839108321584e-06} {'loss': 0.7614, 'grad_norm': 14.377680778503418, 'learning_rate': 5.893839108321584e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.6305, 'grad_norm': 10.104775428771973, 'learning_rate': 5.887622285463798e-06}[Rank 2] Trainer log: {'loss': 0.6305, 'grad_norm': 10.104775428771973, 'learning_rate': 5.887622285463798e-06} [Rank 0] Trainer log: {'loss': 0.6305, 'grad_norm': 10.104775428771973, 'learning_rate': 5.887622285463798e-06} [Rank 1] Trainer log: {'loss': 0.6305, 'grad_norm': 10.104775428771973, 'learning_rate': 5.887622285463798e-06} {'loss': 0.6305, 'grad_norm': 10.104775428771973, 'learning_rate': 5.887622285463798e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.7179, 'grad_norm': 2.6785645484924316, 'learning_rate': 5.881407374907233e-06}[Rank 0] Trainer log: {'loss': 0.7179, 'grad_norm': 2.6785645484924316, 'learning_rate': 5.881407374907233e-06}[Rank 2] Trainer log: {'loss': 0.7179, 'grad_norm': 2.6785645484924316, 'learning_rate': 5.881407374907233e-06} [Rank 3] Trainer log: {'loss': 0.7179, 'grad_norm': 2.6785645484924316, 'learning_rate': 5.881407374907233e-06} {'loss': 0.7179, 'grad_norm': 2.6785645484924316, 'learning_rate': 5.881407374907233e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.7464, 'grad_norm': 1.9076381921768188, 'learning_rate': 5.875194379541885e-06} [Rank 1] Trainer log: {'loss': 0.7464, 'grad_norm': 1.9076381921768188, 'learning_rate': 5.875194379541885e-06} [Rank 0] Trainer log: {'loss': 0.7464, 'grad_norm': 1.9076381921768188, 'learning_rate': 5.875194379541885e-06}[Rank 3] Trainer log: {'loss': 0.7464, 'grad_norm': 1.9076381921768188, 'learning_rate': 5.875194379541885e-06} {'loss': 0.7464, 'grad_norm': 1.9076381921768188, 'learning_rate': 5.875194379541885e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.647, 'grad_norm': 2.5980637073516846, 'learning_rate': 5.868983302256863e-06} [Rank 3] Trainer log: {'loss': 0.647, 'grad_norm': 2.5980637073516846, 'learning_rate': 5.868983302256863e-06} [Rank 0] Trainer log: {'loss': 0.647, 'grad_norm': 2.5980637073516846, 'learning_rate': 5.868983302256863e-06}[Rank 1] Trainer log: {'loss': 0.647, 'grad_norm': 2.5980637073516846, 'learning_rate': 5.868983302256863e-06} {'loss': 0.647, 'grad_norm': 2.5980637073516846, 'learning_rate': 5.868983302256863e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.8, 'grad_norm': 5.024467945098877, 'learning_rate': 5.862774145940395e-06} [Rank 1] Trainer log: {'loss': 0.8, 'grad_norm': 5.024467945098877, 'learning_rate': 5.862774145940395e-06}[Rank 2] Trainer log: {'loss': 0.8, 'grad_norm': 5.024467945098877, 'learning_rate': 5.862774145940395e-06} [Rank 0] Trainer log: {'loss': 0.8, 'grad_norm': 5.024467945098877, 'learning_rate': 5.862774145940395e-06} {'loss': 0.8, 'grad_norm': 5.024467945098877, 'learning_rate': 5.862774145940395e-06, 'epoch': 0.65} [Rank 0] Trainer log: {'loss': 0.5837, 'grad_norm': 4.595396041870117, 'learning_rate': 5.8565669134798e-06}[Rank 3] Trainer log: {'loss': 0.5837, 'grad_norm': 4.595396041870117, 'learning_rate': 5.8565669134798e-06} [Rank 2] Trainer log: {'loss': 0.5837, 'grad_norm': 4.595396041870117, 'learning_rate': 5.8565669134798e-06} [Rank 1] Trainer log: {'loss': 0.5837, 'grad_norm': 4.595396041870117, 'learning_rate': 5.8565669134798e-06} {'loss': 0.5837, 'grad_norm': 4.595396041870117, 'learning_rate': 5.8565669134798e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.9345, 'grad_norm': 6.299933433532715, 'learning_rate': 5.85036160776151e-06}[Rank 3] Trainer log: {'loss': 0.9345, 'grad_norm': 6.299933433532715, 'learning_rate': 5.85036160776151e-06}[Rank 2] Trainer log: {'loss': 0.9345, 'grad_norm': 6.299933433532715, 'learning_rate': 5.85036160776151e-06} [Rank 0] Trainer log: {'loss': 0.9345, 'grad_norm': 6.299933433532715, 'learning_rate': 5.85036160776151e-06} {'loss': 0.9345, 'grad_norm': 6.299933433532715, 'learning_rate': 5.85036160776151e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 0.6912, 'grad_norm': 1.7991853952407837, 'learning_rate': 5.844158231671069e-06}[Rank 3] Trainer log: {'loss': 0.6912, 'grad_norm': 1.7991853952407837, 'learning_rate': 5.844158231671069e-06} [Rank 0] Trainer log: {'loss': 0.6912, 'grad_norm': 1.7991853952407837, 'learning_rate': 5.844158231671069e-06}[Rank 2] Trainer log: {'loss': 0.6912, 'grad_norm': 1.7991853952407837, 'learning_rate': 5.844158231671069e-06} {'loss': 0.6912, 'grad_norm': 1.7991853952407837, 'learning_rate': 5.844158231671069e-06, 'epoch': 0.65} [Rank 1] Trainer log: {'loss': 1.0108, 'grad_norm': 3.993945598602295, 'learning_rate': 5.837956788093107e-06} [Rank 3] Trainer log: {'loss': 1.0108, 'grad_norm': 3.993945598602295, 'learning_rate': 5.837956788093107e-06} [Rank 0] Trainer log: {'loss': 1.0108, 'grad_norm': 3.993945598602295, 'learning_rate': 5.837956788093107e-06}[Rank 2] Trainer log: {'loss': 1.0108, 'grad_norm': 3.993945598602295, 'learning_rate': 5.837956788093107e-06} {'loss': 1.0108, 'grad_norm': 3.993945598602295, 'learning_rate': 5.837956788093107e-06, 'epoch': 0.65} [Rank 3] Trainer log: {'loss': 0.7745, 'grad_norm': 4.714572906494141, 'learning_rate': 5.831757279911365e-06}[Rank 2] Trainer log: {'loss': 0.7745, 'grad_norm': 4.714572906494141, 'learning_rate': 5.831757279911365e-06} [Rank 1] Trainer log: {'loss': 0.7745, 'grad_norm': 4.714572906494141, 'learning_rate': 5.831757279911365e-06} [Rank 0] Trainer log: {'loss': 0.7745, 'grad_norm': 4.714572906494141, 'learning_rate': 5.831757279911365e-06} {'loss': 0.7745, 'grad_norm': 4.714572906494141, 'learning_rate': 5.831757279911365e-06, 'epoch': 0.65} [Rank 2] Trainer log: {'loss': 0.8246, 'grad_norm': 2.178642749786377, 'learning_rate': 5.825559710008688e-06}[Rank 1] Trainer log: {'loss': 0.8246, 'grad_norm': 2.178642749786377, 'learning_rate': 5.825559710008688e-06} [Rank 3] Trainer log: {'loss': 0.8246, 'grad_norm': 2.178642749786377, 'learning_rate': 5.825559710008688e-06} [Rank 0] Trainer log: {'loss': 0.8246, 'grad_norm': 2.178642749786377, 'learning_rate': 5.825559710008688e-06} {'loss': 0.8246, 'grad_norm': 2.178642749786377, 'learning_rate': 5.825559710008688e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9382, 'grad_norm': 4.0292158126831055, 'learning_rate': 5.819364081267014e-06}[Rank 3] Trainer log: {'loss': 0.9382, 'grad_norm': 4.0292158126831055, 'learning_rate': 5.819364081267014e-06}[Rank 1] Trainer log: {'loss': 0.9382, 'grad_norm': 4.0292158126831055, 'learning_rate': 5.819364081267014e-06} [Rank 0] Trainer log: {'loss': 0.9382, 'grad_norm': 4.0292158126831055, 'learning_rate': 5.819364081267014e-06} {'loss': 0.9382, 'grad_norm': 4.0292158126831055, 'learning_rate': 5.819364081267014e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.7945, 'grad_norm': 15.364354133605957, 'learning_rate': 5.813170396567372e-06} [Rank 3] Trainer log: {'loss': 0.7945, 'grad_norm': 15.364354133605957, 'learning_rate': 5.813170396567372e-06}[Rank 0] Trainer log: {'loss': 0.7945, 'grad_norm': 15.364354133605957, 'learning_rate': 5.813170396567372e-06}[Rank 1] Trainer log: {'loss': 0.7945, 'grad_norm': 15.364354133605957, 'learning_rate': 5.813170396567372e-06} {'loss': 0.7945, 'grad_norm': 15.364354133605957, 'learning_rate': 5.813170396567372e-06, 'epoch': 0.66} [Rank 0] Trainer log: {'loss': 0.6708, 'grad_norm': 3.627948760986328, 'learning_rate': 5.806978658789901e-06}[Rank 3] Trainer log: {'loss': 0.6708, 'grad_norm': 3.627948760986328, 'learning_rate': 5.806978658789901e-06} [Rank 2] Trainer log: {'loss': 0.6708, 'grad_norm': 3.627948760986328, 'learning_rate': 5.806978658789901e-06} [Rank 1] Trainer log: {'loss': 0.6708, 'grad_norm': 3.627948760986328, 'learning_rate': 5.806978658789901e-06} {'loss': 0.6708, 'grad_norm': 3.627948760986328, 'learning_rate': 5.806978658789901e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.7045, 'grad_norm': 4.504464149475098, 'learning_rate': 5.800788870813827e-06}[Rank 2] Trainer log: {'loss': 0.7045, 'grad_norm': 4.504464149475098, 'learning_rate': 5.800788870813827e-06}[Rank 3] Trainer log: {'loss': 0.7045, 'grad_norm': 4.504464149475098, 'learning_rate': 5.800788870813827e-06} [Rank 0] Trainer log: {'loss': 0.7045, 'grad_norm': 4.504464149475098, 'learning_rate': 5.800788870813827e-06} {'loss': 0.7045, 'grad_norm': 4.504464149475098, 'learning_rate': 5.800788870813827e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9814, 'grad_norm': 2.1649351119995117, 'learning_rate': 5.794601035517465e-06} [Rank 3] Trainer log: {'loss': 0.9814, 'grad_norm': 2.1649351119995117, 'learning_rate': 5.794601035517465e-06} [Rank 1] Trainer log: {'loss': 0.9814, 'grad_norm': 2.1649351119995117, 'learning_rate': 5.794601035517465e-06} [Rank 0] Trainer log: {'loss': 0.9814, 'grad_norm': 2.1649351119995117, 'learning_rate': 5.794601035517465e-06} {'loss': 0.9814, 'grad_norm': 2.1649351119995117, 'learning_rate': 5.794601035517465e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.8405, 'grad_norm': 11.242415428161621, 'learning_rate': 5.7884151557782305e-06}[Rank 2] Trainer log: {'loss': 0.8405, 'grad_norm': 11.242415428161621, 'learning_rate': 5.7884151557782305e-06}[Rank 3] Trainer log: {'loss': 0.8405, 'grad_norm': 11.242415428161621, 'learning_rate': 5.7884151557782305e-06} [Rank 0] Trainer log: {'loss': 0.8405, 'grad_norm': 11.242415428161621, 'learning_rate': 5.7884151557782305e-06} {'loss': 0.8405, 'grad_norm': 11.242415428161621, 'learning_rate': 5.7884151557782305e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.9447, 'grad_norm': 3.675645589828491, 'learning_rate': 5.78223123447263e-06}[Rank 2] Trainer log: {'loss': 0.9447, 'grad_norm': 3.675645589828491, 'learning_rate': 5.78223123447263e-06}[Rank 0] Trainer log: {'loss': 0.9447, 'grad_norm': 3.675645589828491, 'learning_rate': 5.78223123447263e-06} [Rank 3] Trainer log: {'loss': 0.9447, 'grad_norm': 3.675645589828491, 'learning_rate': 5.78223123447263e-06} {'loss': 0.9447, 'grad_norm': 3.675645589828491, 'learning_rate': 5.78223123447263e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 0.771, 'grad_norm': 5.765727519989014, 'learning_rate': 5.776049274476248e-06}[Rank 2] Trainer log: {'loss': 0.771, 'grad_norm': 5.765727519989014, 'learning_rate': 5.776049274476248e-06}[Rank 1] Trainer log: {'loss': 0.771, 'grad_norm': 5.765727519989014, 'learning_rate': 5.776049274476248e-06} [Rank 0] Trainer log: {'loss': 0.771, 'grad_norm': 5.765727519989014, 'learning_rate': 5.776049274476248e-06} {'loss': 0.771, 'grad_norm': 5.765727519989014, 'learning_rate': 5.776049274476248e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8277, 'grad_norm': 2.1863696575164795, 'learning_rate': 5.769869278663771e-06} [Rank 0] Trainer log: {'loss': 0.8277, 'grad_norm': 2.1863696575164795, 'learning_rate': 5.769869278663771e-06} [Rank 3] Trainer log: {'loss': 0.8277, 'grad_norm': 2.1863696575164795, 'learning_rate': 5.769869278663771e-06} [Rank 1] Trainer log: {'loss': 0.8277, 'grad_norm': 2.1863696575164795, 'learning_rate': 5.769869278663771e-06} {'loss': 0.8277, 'grad_norm': 2.1863696575164795, 'learning_rate': 5.769869278663771e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9307, 'grad_norm': 13.87835693359375, 'learning_rate': 5.763691249908962e-06}[Rank 3] Trainer log: {'loss': 0.9307, 'grad_norm': 13.87835693359375, 'learning_rate': 5.763691249908962e-06} [Rank 1] Trainer log: {'loss': 0.9307, 'grad_norm': 13.87835693359375, 'learning_rate': 5.763691249908962e-06} [Rank 0] Trainer log: {'loss': 0.9307, 'grad_norm': 13.87835693359375, 'learning_rate': 5.763691249908962e-06} {'loss': 0.9307, 'grad_norm': 13.87835693359375, 'learning_rate': 5.763691249908962e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.749, 'grad_norm': 4.8269944190979, 'learning_rate': 5.757515191084675e-06} [Rank 0] Trainer log: {'loss': 0.749, 'grad_norm': 4.8269944190979, 'learning_rate': 5.757515191084675e-06}[Rank 3] Trainer log: {'loss': 0.749, 'grad_norm': 4.8269944190979, 'learning_rate': 5.757515191084675e-06} [Rank 2] Trainer log: {'loss': 0.749, 'grad_norm': 4.8269944190979, 'learning_rate': 5.757515191084675e-06} {'loss': 0.749, 'grad_norm': 4.8269944190979, 'learning_rate': 5.757515191084675e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.7878, 'grad_norm': 5.80725622177124, 'learning_rate': 5.7513411050628395e-06}[Rank 1] Trainer log: {'loss': 0.7878, 'grad_norm': 5.80725622177124, 'learning_rate': 5.7513411050628395e-06} [Rank 3] Trainer log: {'loss': 0.7878, 'grad_norm': 5.80725622177124, 'learning_rate': 5.7513411050628395e-06} [Rank 0] Trainer log: {'loss': 0.7878, 'grad_norm': 5.80725622177124, 'learning_rate': 5.7513411050628395e-06} {'loss': 0.7878, 'grad_norm': 5.80725622177124, 'learning_rate': 5.7513411050628395e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.563, 'grad_norm': 2.805943489074707, 'learning_rate': 5.745168994714483e-06}[Rank 2] Trainer log: {'loss': 0.563, 'grad_norm': 2.805943489074707, 'learning_rate': 5.745168994714483e-06} [Rank 0] Trainer log: {'loss': 0.563, 'grad_norm': 2.805943489074707, 'learning_rate': 5.745168994714483e-06}[Rank 3] Trainer log: {'loss': 0.563, 'grad_norm': 2.805943489074707, 'learning_rate': 5.745168994714483e-06} {'loss': 0.563, 'grad_norm': 2.805943489074707, 'learning_rate': 5.745168994714483e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 1.1021, 'grad_norm': 4.076926231384277, 'learning_rate': 5.738998862909696e-06}[Rank 3] Trainer log: {'loss': 1.1021, 'grad_norm': 4.076926231384277, 'learning_rate': 5.738998862909696e-06} [Rank 1] Trainer log: {'loss': 1.1021, 'grad_norm': 4.076926231384277, 'learning_rate': 5.738998862909696e-06} [Rank 0] Trainer log: {'loss': 1.1021, 'grad_norm': 4.076926231384277, 'learning_rate': 5.738998862909696e-06} {'loss': 1.1021, 'grad_norm': 4.076926231384277, 'learning_rate': 5.738998862909696e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 0.757, 'grad_norm': 2.2050209045410156, 'learning_rate': 5.732830712517663e-06}[Rank 1] Trainer log: {'loss': 0.757, 'grad_norm': 2.2050209045410156, 'learning_rate': 5.732830712517663e-06}[Rank 2] Trainer log: {'loss': 0.757, 'grad_norm': 2.2050209045410156, 'learning_rate': 5.732830712517663e-06} [Rank 0] Trainer log: {'loss': 0.757, 'grad_norm': 2.2050209045410156, 'learning_rate': 5.732830712517663e-06} {'loss': 0.757, 'grad_norm': 2.2050209045410156, 'learning_rate': 5.732830712517663e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8286, 'grad_norm': 4.614724159240723, 'learning_rate': 5.726664546406648e-06}[Rank 3] Trainer log: {'loss': 0.8286, 'grad_norm': 4.614724159240723, 'learning_rate': 5.726664546406648e-06} [Rank 1] Trainer log: {'loss': 0.8286, 'grad_norm': 4.614724159240723, 'learning_rate': 5.726664546406648e-06} [Rank 0] Trainer log: {'loss': 0.8286, 'grad_norm': 4.614724159240723, 'learning_rate': 5.726664546406648e-06} {'loss': 0.8286, 'grad_norm': 4.614724159240723, 'learning_rate': 5.726664546406648e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9618, 'grad_norm': 7.22453498840332, 'learning_rate': 5.72050036744398e-06}[Rank 3] Trainer log: {'loss': 0.9618, 'grad_norm': 7.22453498840332, 'learning_rate': 5.72050036744398e-06} [Rank 0] Trainer log: {'loss': 0.9618, 'grad_norm': 7.22453498840332, 'learning_rate': 5.72050036744398e-06}[Rank 1] Trainer log: {'loss': 0.9618, 'grad_norm': 7.22453498840332, 'learning_rate': 5.72050036744398e-06} {'loss': 0.9618, 'grad_norm': 7.22453498840332, 'learning_rate': 5.72050036744398e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9573, 'grad_norm': 4.553210258483887, 'learning_rate': 5.714338178496067e-06} [Rank 3] Trainer log: {'loss': 0.9573, 'grad_norm': 4.553210258483887, 'learning_rate': 5.714338178496067e-06} [Rank 0] Trainer log: {'loss': 0.9573, 'grad_norm': 4.553210258483887, 'learning_rate': 5.714338178496067e-06}[Rank 1] Trainer log: {'loss': 0.9573, 'grad_norm': 4.553210258483887, 'learning_rate': 5.714338178496067e-06} {'loss': 0.9573, 'grad_norm': 4.553210258483887, 'learning_rate': 5.714338178496067e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 0.8592, 'grad_norm': 4.1339111328125, 'learning_rate': 5.7081779824284045e-06} [Rank 0] Trainer log: {'loss': 0.8592, 'grad_norm': 4.1339111328125, 'learning_rate': 5.7081779824284045e-06}[Rank 1] Trainer log: {'loss': 0.8592, 'grad_norm': 4.1339111328125, 'learning_rate': 5.7081779824284045e-06}[Rank 2] Trainer log: {'loss': 0.8592, 'grad_norm': 4.1339111328125, 'learning_rate': 5.7081779824284045e-06} {'loss': 0.8592, 'grad_norm': 4.1339111328125, 'learning_rate': 5.7081779824284045e-06, 'epoch': 0.66} [Rank 0] Trainer log: {'loss': 0.7665, 'grad_norm': 7.088177680969238, 'learning_rate': 5.702019782105546e-06}[Rank 2] Trainer log: {'loss': 0.7665, 'grad_norm': 7.088177680969238, 'learning_rate': 5.702019782105546e-06} [Rank 3] Trainer log: {'loss': 0.7665, 'grad_norm': 7.088177680969238, 'learning_rate': 5.702019782105546e-06} [Rank 1] Trainer log: {'loss': 0.7665, 'grad_norm': 7.088177680969238, 'learning_rate': 5.702019782105546e-06} {'loss': 0.7665, 'grad_norm': 7.088177680969238, 'learning_rate': 5.702019782105546e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.8554, 'grad_norm': 5.789768695831299, 'learning_rate': 5.695863580391121e-06}[Rank 2] Trainer log: {'loss': 0.8554, 'grad_norm': 5.789768695831299, 'learning_rate': 5.695863580391121e-06} [Rank 3] Trainer log: {'loss': 0.8554, 'grad_norm': 5.789768695831299, 'learning_rate': 5.695863580391121e-06} [Rank 0] Trainer log: {'loss': 0.8554, 'grad_norm': 5.789768695831299, 'learning_rate': 5.695863580391121e-06} {'loss': 0.8554, 'grad_norm': 5.789768695831299, 'learning_rate': 5.695863580391121e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.596, 'grad_norm': 5.944942474365234, 'learning_rate': 5.689709380147837e-06} [Rank 3] Trainer log: {'loss': 0.596, 'grad_norm': 5.944942474365234, 'learning_rate': 5.689709380147837e-06}[Rank 2] Trainer log: {'loss': 0.596, 'grad_norm': 5.944942474365234, 'learning_rate': 5.689709380147837e-06} [Rank 0] Trainer log: {'loss': 0.596, 'grad_norm': 5.944942474365234, 'learning_rate': 5.689709380147837e-06} {'loss': 0.596, 'grad_norm': 5.944942474365234, 'learning_rate': 5.689709380147837e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 0.8711, 'grad_norm': 2.4978461265563965, 'learning_rate': 5.68355718423746e-06} [Rank 2] Trainer log: {'loss': 0.8711, 'grad_norm': 2.4978461265563965, 'learning_rate': 5.68355718423746e-06} [Rank 1] Trainer log: {'loss': 0.8711, 'grad_norm': 2.4978461265563965, 'learning_rate': 5.68355718423746e-06} [Rank 0] Trainer log: {'loss': 0.8711, 'grad_norm': 2.4978461265563965, 'learning_rate': 5.68355718423746e-06} {'loss': 0.8711, 'grad_norm': 2.4978461265563965, 'learning_rate': 5.68355718423746e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 1.0302, 'grad_norm': 5.07874059677124, 'learning_rate': 5.677406995520832e-06}[Rank 3] Trainer log: {'loss': 1.0302, 'grad_norm': 5.07874059677124, 'learning_rate': 5.677406995520832e-06}[Rank 2] Trainer log: {'loss': 1.0302, 'grad_norm': 5.07874059677124, 'learning_rate': 5.677406995520832e-06} [Rank 0] Trainer log: {'loss': 1.0302, 'grad_norm': 5.07874059677124, 'learning_rate': 5.677406995520832e-06} {'loss': 1.0302, 'grad_norm': 5.07874059677124, 'learning_rate': 5.677406995520832e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.928, 'grad_norm': 3.810596227645874, 'learning_rate': 5.671258816857863e-06}[Rank 1] Trainer log: {'loss': 0.928, 'grad_norm': 3.810596227645874, 'learning_rate': 5.671258816857863e-06} [Rank 0] Trainer log: {'loss': 0.928, 'grad_norm': 3.810596227645874, 'learning_rate': 5.671258816857863e-06}[Rank 3] Trainer log: {'loss': 0.928, 'grad_norm': 3.810596227645874, 'learning_rate': 5.671258816857863e-06} {'loss': 0.928, 'grad_norm': 3.810596227645874, 'learning_rate': 5.671258816857863e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.6054, 'grad_norm': 8.63343620300293, 'learning_rate': 5.66511265110752e-06}[Rank 0] Trainer log: {'loss': 0.6054, 'grad_norm': 8.63343620300293, 'learning_rate': 5.66511265110752e-06} [Rank 1] Trainer log: {'loss': 0.6054, 'grad_norm': 8.63343620300293, 'learning_rate': 5.66511265110752e-06}[Rank 3] Trainer log: {'loss': 0.6054, 'grad_norm': 8.63343620300293, 'learning_rate': 5.66511265110752e-06} {'loss': 0.6054, 'grad_norm': 8.63343620300293, 'learning_rate': 5.66511265110752e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.7133, 'grad_norm': 2.6178202629089355, 'learning_rate': 5.658968501127835e-06} [Rank 2] Trainer log: {'loss': 0.7133, 'grad_norm': 2.6178202629089355, 'learning_rate': 5.658968501127835e-06} [Rank 0] Trainer log: {'loss': 0.7133, 'grad_norm': 2.6178202629089355, 'learning_rate': 5.658968501127835e-06} [Rank 3] Trainer log: {'loss': 0.7133, 'grad_norm': 2.6178202629089355, 'learning_rate': 5.658968501127835e-06} {'loss': 0.7133, 'grad_norm': 2.6178202629089355, 'learning_rate': 5.658968501127835e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.7397, 'grad_norm': 9.787893295288086, 'learning_rate': 5.652826369775912e-06}[Rank 1] Trainer log: {'loss': 0.7397, 'grad_norm': 9.787893295288086, 'learning_rate': 5.652826369775912e-06}[Rank 3] Trainer log: {'loss': 0.7397, 'grad_norm': 9.787893295288086, 'learning_rate': 5.652826369775912e-06} [Rank 0] Trainer log: {'loss': 0.7397, 'grad_norm': 9.787893295288086, 'learning_rate': 5.652826369775912e-06} {'loss': 0.7397, 'grad_norm': 9.787893295288086, 'learning_rate': 5.652826369775912e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.6245, 'grad_norm': 9.020448684692383, 'learning_rate': 5.646686259907907e-06}[Rank 1] Trainer log: {'loss': 0.6245, 'grad_norm': 9.020448684692383, 'learning_rate': 5.646686259907907e-06}[Rank 3] Trainer log: {'loss': 0.6245, 'grad_norm': 9.020448684692383, 'learning_rate': 5.646686259907907e-06} [Rank 0] Trainer log: {'loss': 0.6245, 'grad_norm': 9.020448684692383, 'learning_rate': 5.646686259907907e-06} {'loss': 0.6245, 'grad_norm': 9.020448684692383, 'learning_rate': 5.646686259907907e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 1.0149, 'grad_norm': 10.937383651733398, 'learning_rate': 5.640548174379038e-06}[Rank 1] Trainer log: {'loss': 1.0149, 'grad_norm': 10.937383651733398, 'learning_rate': 5.640548174379038e-06} [Rank 0] Trainer log: {'loss': 1.0149, 'grad_norm': 10.937383651733398, 'learning_rate': 5.640548174379038e-06}[Rank 3] Trainer log: {'loss': 1.0149, 'grad_norm': 10.937383651733398, 'learning_rate': 5.640548174379038e-06} {'loss': 1.0149, 'grad_norm': 10.937383651733398, 'learning_rate': 5.640548174379038e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8328, 'grad_norm': 7.032098293304443, 'learning_rate': 5.634412116043588e-06} [Rank 1] Trainer log: {'loss': 0.8328, 'grad_norm': 7.032098293304443, 'learning_rate': 5.634412116043588e-06} [Rank 3] Trainer log: {'loss': 0.8328, 'grad_norm': 7.032098293304443, 'learning_rate': 5.634412116043588e-06} [Rank 0] Trainer log: {'loss': 0.8328, 'grad_norm': 7.032098293304443, 'learning_rate': 5.634412116043588e-06} {'loss': 0.8328, 'grad_norm': 7.032098293304443, 'learning_rate': 5.634412116043588e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 1.0115, 'grad_norm': 2.4389801025390625, 'learning_rate': 5.628278087754882e-06}[Rank 2] Trainer log: {'loss': 1.0115, 'grad_norm': 2.4389801025390625, 'learning_rate': 5.628278087754882e-06} [Rank 3] Trainer log: {'loss': 1.0115, 'grad_norm': 2.4389801025390625, 'learning_rate': 5.628278087754882e-06} [Rank 0] Trainer log: {'loss': 1.0115, 'grad_norm': 2.4389801025390625, 'learning_rate': 5.628278087754882e-06} {'loss': 1.0115, 'grad_norm': 2.4389801025390625, 'learning_rate': 5.628278087754882e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 1.0028, 'grad_norm': 4.462586402893066, 'learning_rate': 5.622146092365319e-06}[Rank 2] Trainer log: {'loss': 1.0028, 'grad_norm': 4.462586402893066, 'learning_rate': 5.622146092365319e-06} [Rank 0] Trainer log: {'loss': 1.0028, 'grad_norm': 4.462586402893066, 'learning_rate': 5.622146092365319e-06}[Rank 1] Trainer log: {'loss': 1.0028, 'grad_norm': 4.462586402893066, 'learning_rate': 5.622146092365319e-06} {'loss': 1.0028, 'grad_norm': 4.462586402893066, 'learning_rate': 5.622146092365319e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.762, 'grad_norm': 9.13481616973877, 'learning_rate': 5.616016132726348e-06} [Rank 0] Trainer log: {'loss': 0.762, 'grad_norm': 9.13481616973877, 'learning_rate': 5.616016132726348e-06}[Rank 1] Trainer log: {'loss': 0.762, 'grad_norm': 9.13481616973877, 'learning_rate': 5.616016132726348e-06} [Rank 3] Trainer log: {'loss': 0.762, 'grad_norm': 9.13481616973877, 'learning_rate': 5.616016132726348e-06} {'loss': 0.762, 'grad_norm': 9.13481616973877, 'learning_rate': 5.616016132726348e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8399, 'grad_norm': 6.659610748291016, 'learning_rate': 5.609888211688463e-06} [Rank 1] Trainer log: {'loss': 0.8399, 'grad_norm': 6.659610748291016, 'learning_rate': 5.609888211688463e-06} [Rank 3] Trainer log: {'loss': 0.8399, 'grad_norm': 6.659610748291016, 'learning_rate': 5.609888211688463e-06} [Rank 0] Trainer log: {'loss': 0.8399, 'grad_norm': 6.659610748291016, 'learning_rate': 5.609888211688463e-06} {'loss': 0.8399, 'grad_norm': 6.659610748291016, 'learning_rate': 5.609888211688463e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.6043, 'grad_norm': 12.113874435424805, 'learning_rate': 5.603762332101217e-06}[Rank 2] Trainer log: {'loss': 0.6043, 'grad_norm': 12.113874435424805, 'learning_rate': 5.603762332101217e-06} [Rank 0] Trainer log: {'loss': 0.6043, 'grad_norm': 12.113874435424805, 'learning_rate': 5.603762332101217e-06}[Rank 3] Trainer log: {'loss': 0.6043, 'grad_norm': 12.113874435424805, 'learning_rate': 5.603762332101217e-06} {'loss': 0.6043, 'grad_norm': 12.113874435424805, 'learning_rate': 5.603762332101217e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8516, 'grad_norm': 3.1041598320007324, 'learning_rate': 5.5976384968132044e-06}[Rank 1] Trainer log: {'loss': 0.8516, 'grad_norm': 3.1041598320007324, 'learning_rate': 5.5976384968132044e-06} [Rank 3] Trainer log: {'loss': 0.8516, 'grad_norm': 3.1041598320007324, 'learning_rate': 5.5976384968132044e-06} [Rank 0] Trainer log: {'loss': 0.8516, 'grad_norm': 3.1041598320007324, 'learning_rate': 5.5976384968132044e-06} {'loss': 0.8516, 'grad_norm': 3.1041598320007324, 'learning_rate': 5.5976384968132044e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8303, 'grad_norm': 20.15412712097168, 'learning_rate': 5.591516708672089e-06}[Rank 3] Trainer log: {'loss': 0.8303, 'grad_norm': 20.15412712097168, 'learning_rate': 5.591516708672089e-06} [Rank 1] Trainer log: {'loss': 0.8303, 'grad_norm': 20.15412712097168, 'learning_rate': 5.591516708672089e-06} [Rank 0] Trainer log: {'loss': 0.8303, 'grad_norm': 20.15412712097168, 'learning_rate': 5.591516708672089e-06} {'loss': 0.8303, 'grad_norm': 20.15412712097168, 'learning_rate': 5.591516708672089e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9868, 'grad_norm': 5.803197860717773, 'learning_rate': 5.58539697052456e-06}[Rank 1] Trainer log: {'loss': 0.9868, 'grad_norm': 5.803197860717773, 'learning_rate': 5.58539697052456e-06}[Rank 3] Trainer log: {'loss': 0.9868, 'grad_norm': 5.803197860717773, 'learning_rate': 5.58539697052456e-06} [Rank 0] Trainer log: {'loss': 0.9868, 'grad_norm': 5.803197860717773, 'learning_rate': 5.58539697052456e-06} {'loss': 0.9868, 'grad_norm': 5.803197860717773, 'learning_rate': 5.58539697052456e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.8171, 'grad_norm': 2.524265766143799, 'learning_rate': 5.579279285216369e-06}[Rank 1] Trainer log: {'loss': 0.8171, 'grad_norm': 2.524265766143799, 'learning_rate': 5.579279285216369e-06} [Rank 3] Trainer log: {'loss': 0.8171, 'grad_norm': 2.524265766143799, 'learning_rate': 5.579279285216369e-06} [Rank 0] Trainer log: {'loss': 0.8171, 'grad_norm': 2.524265766143799, 'learning_rate': 5.579279285216369e-06} {'loss': 0.8171, 'grad_norm': 2.524265766143799, 'learning_rate': 5.579279285216369e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 1.0282, 'grad_norm': 7.4124298095703125, 'learning_rate': 5.573163655592304e-06}[Rank 2] Trainer log: {'loss': 1.0282, 'grad_norm': 7.4124298095703125, 'learning_rate': 5.573163655592304e-06}[Rank 3] Trainer log: {'loss': 1.0282, 'grad_norm': 7.4124298095703125, 'learning_rate': 5.573163655592304e-06} [Rank 0] Trainer log: {'loss': 1.0282, 'grad_norm': 7.4124298095703125, 'learning_rate': 5.573163655592304e-06} {'loss': 1.0282, 'grad_norm': 7.4124298095703125, 'learning_rate': 5.573163655592304e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.9071, 'grad_norm': 5.749579906463623, 'learning_rate': 5.567050084496206e-06}[Rank 3] Trainer log: {'loss': 0.9071, 'grad_norm': 5.749579906463623, 'learning_rate': 5.567050084496206e-06}[Rank 0] Trainer log: {'loss': 0.9071, 'grad_norm': 5.749579906463623, 'learning_rate': 5.567050084496206e-06} [Rank 2] Trainer log: {'loss': 0.9071, 'grad_norm': 5.749579906463623, 'learning_rate': 5.567050084496206e-06} {'loss': 0.9071, 'grad_norm': 5.749579906463623, 'learning_rate': 5.567050084496206e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 1.0282, 'grad_norm': 2.2232775688171387, 'learning_rate': 5.560938574770945e-06}[Rank 2] Trainer log: {'loss': 1.0282, 'grad_norm': 2.2232775688171387, 'learning_rate': 5.560938574770945e-06} [Rank 0] Trainer log: {'loss': 1.0282, 'grad_norm': 2.2232775688171387, 'learning_rate': 5.560938574770945e-06}[Rank 1] Trainer log: {'loss': 1.0282, 'grad_norm': 2.2232775688171387, 'learning_rate': 5.560938574770945e-06} {'loss': 1.0282, 'grad_norm': 2.2232775688171387, 'learning_rate': 5.560938574770945e-06, 'epoch': 0.66} [Rank 3] Trainer log: {'loss': 1.0679, 'grad_norm': 7.144684314727783, 'learning_rate': 5.55482912925845e-06}[Rank 2] Trainer log: {'loss': 1.0679, 'grad_norm': 7.144684314727783, 'learning_rate': 5.55482912925845e-06} [Rank 1] Trainer log: {'loss': 1.0679, 'grad_norm': 7.144684314727783, 'learning_rate': 5.55482912925845e-06} [Rank 0] Trainer log: {'loss': 1.0679, 'grad_norm': 7.144684314727783, 'learning_rate': 5.55482912925845e-06} {'loss': 1.0679, 'grad_norm': 7.144684314727783, 'learning_rate': 5.55482912925845e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.9185, 'grad_norm': 2.3819093704223633, 'learning_rate': 5.548721750799676e-06} [Rank 1] Trainer log: {'loss': 0.9185, 'grad_norm': 2.3819093704223633, 'learning_rate': 5.548721750799676e-06} [Rank 3] Trainer log: {'loss': 0.9185, 'grad_norm': 2.3819093704223633, 'learning_rate': 5.548721750799676e-06}[Rank 0] Trainer log: {'loss': 0.9185, 'grad_norm': 2.3819093704223633, 'learning_rate': 5.548721750799676e-06} {'loss': 0.9185, 'grad_norm': 2.3819093704223633, 'learning_rate': 5.548721750799676e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.8868, 'grad_norm': 3.9080581665039062, 'learning_rate': 5.542616442234618e-06}[Rank 3] Trainer log: {'loss': 0.8868, 'grad_norm': 3.9080581665039062, 'learning_rate': 5.542616442234618e-06} [Rank 2] Trainer log: {'loss': 0.8868, 'grad_norm': 3.9080581665039062, 'learning_rate': 5.542616442234618e-06} [Rank 0] Trainer log: {'loss': 0.8868, 'grad_norm': 3.9080581665039062, 'learning_rate': 5.542616442234618e-06} {'loss': 0.8868, 'grad_norm': 3.9080581665039062, 'learning_rate': 5.542616442234618e-06, 'epoch': 0.66} [Rank 2] Trainer log: {'loss': 0.5347, 'grad_norm': 10.667551040649414, 'learning_rate': 5.5365132064023204e-06}[Rank 1] Trainer log: {'loss': 0.5347, 'grad_norm': 10.667551040649414, 'learning_rate': 5.5365132064023204e-06}[Rank 0] Trainer log: {'loss': 0.5347, 'grad_norm': 10.667551040649414, 'learning_rate': 5.5365132064023204e-06} [Rank 3] Trainer log: {'loss': 0.5347, 'grad_norm': 10.667551040649414, 'learning_rate': 5.5365132064023204e-06} {'loss': 0.5347, 'grad_norm': 10.667551040649414, 'learning_rate': 5.5365132064023204e-06, 'epoch': 0.66} [Rank 1] Trainer log: {'loss': 0.826, 'grad_norm': 6.097390174865723, 'learning_rate': 5.5304120461408475e-06}[Rank 3] Trainer log: {'loss': 0.826, 'grad_norm': 6.097390174865723, 'learning_rate': 5.5304120461408475e-06} [Rank 0] Trainer log: {'loss': 0.826, 'grad_norm': 6.097390174865723, 'learning_rate': 5.5304120461408475e-06} [Rank 2] Trainer log: {'loss': 0.826, 'grad_norm': 6.097390174865723, 'learning_rate': 5.5304120461408475e-06} {'loss': 0.826, 'grad_norm': 6.097390174865723, 'learning_rate': 5.5304120461408475e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.9536, 'grad_norm': 4.665475845336914, 'learning_rate': 5.524312964287311e-06}[Rank 3] Trainer log: {'loss': 0.9536, 'grad_norm': 4.665475845336914, 'learning_rate': 5.524312964287311e-06}[Rank 2] Trainer log: {'loss': 0.9536, 'grad_norm': 4.665475845336914, 'learning_rate': 5.524312964287311e-06} [Rank 0] Trainer log: {'loss': 0.9536, 'grad_norm': 4.665475845336914, 'learning_rate': 5.524312964287311e-06} {'loss': 0.9536, 'grad_norm': 4.665475845336914, 'learning_rate': 5.524312964287311e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.7613, 'grad_norm': 8.371034622192383, 'learning_rate': 5.518215963677853e-06}[Rank 2] Trainer log: {'loss': 0.7613, 'grad_norm': 8.371034622192383, 'learning_rate': 5.518215963677853e-06} [Rank 1] Trainer log: {'loss': 0.7613, 'grad_norm': 8.371034622192383, 'learning_rate': 5.518215963677853e-06} [Rank 0] Trainer log: {'loss': 0.7613, 'grad_norm': 8.371034622192383, 'learning_rate': 5.518215963677853e-06} {'loss': 0.7613, 'grad_norm': 8.371034622192383, 'learning_rate': 5.518215963677853e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.955, 'grad_norm': 3.633880376815796, 'learning_rate': 5.5121210471476445e-06} [Rank 3] Trainer log: {'loss': 0.955, 'grad_norm': 3.633880376815796, 'learning_rate': 5.5121210471476445e-06} [Rank 1] Trainer log: {'loss': 0.955, 'grad_norm': 3.633880376815796, 'learning_rate': 5.5121210471476445e-06} [Rank 0] Trainer log: {'loss': 0.955, 'grad_norm': 3.633880376815796, 'learning_rate': 5.5121210471476445e-06} {'loss': 0.955, 'grad_norm': 3.633880376815796, 'learning_rate': 5.5121210471476445e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 1.013, 'grad_norm': 1.530562162399292, 'learning_rate': 5.506028217530887e-06}[Rank 2] Trainer log: {'loss': 1.013, 'grad_norm': 1.530562162399292, 'learning_rate': 5.506028217530887e-06}[Rank 1] Trainer log: {'loss': 1.013, 'grad_norm': 1.530562162399292, 'learning_rate': 5.506028217530887e-06} [Rank 0] Trainer log: {'loss': 1.013, 'grad_norm': 1.530562162399292, 'learning_rate': 5.506028217530887e-06} {'loss': 1.013, 'grad_norm': 1.530562162399292, 'learning_rate': 5.506028217530887e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.6136, 'grad_norm': 10.696940422058105, 'learning_rate': 5.499937477660817e-06} [Rank 2] Trainer log: {'loss': 0.6136, 'grad_norm': 10.696940422058105, 'learning_rate': 5.499937477660817e-06} [Rank 1] Trainer log: {'loss': 0.6136, 'grad_norm': 10.696940422058105, 'learning_rate': 5.499937477660817e-06} [Rank 0] Trainer log: {'loss': 0.6136, 'grad_norm': 10.696940422058105, 'learning_rate': 5.499937477660817e-06} {'loss': 0.6136, 'grad_norm': 10.696940422058105, 'learning_rate': 5.499937477660817e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.7408, 'grad_norm': 3.9974398612976074, 'learning_rate': 5.493848830369697e-06}[Rank 2] Trainer log: {'loss': 0.7408, 'grad_norm': 3.9974398612976074, 'learning_rate': 5.493848830369697e-06} [Rank 3] Trainer log: {'loss': 0.7408, 'grad_norm': 3.9974398612976074, 'learning_rate': 5.493848830369697e-06} [Rank 0] Trainer log: {'loss': 0.7408, 'grad_norm': 3.9974398612976074, 'learning_rate': 5.493848830369697e-06} {'loss': 0.7408, 'grad_norm': 3.9974398612976074, 'learning_rate': 5.493848830369697e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 1.0071, 'grad_norm': 3.929058313369751, 'learning_rate': 5.4877622784888074e-06}[Rank 2] Trainer log: {'loss': 1.0071, 'grad_norm': 3.929058313369751, 'learning_rate': 5.4877622784888074e-06} [Rank 3] Trainer log: {'loss': 1.0071, 'grad_norm': 3.929058313369751, 'learning_rate': 5.4877622784888074e-06} [Rank 0] Trainer log: {'loss': 1.0071, 'grad_norm': 3.929058313369751, 'learning_rate': 5.4877622784888074e-06} {'loss': 1.0071, 'grad_norm': 3.929058313369751, 'learning_rate': 5.4877622784888074e-06, 'epoch': 0.67} [Rank 0] Trainer log: {'loss': 0.7948, 'grad_norm': 6.146097660064697, 'learning_rate': 5.4816778248484725e-06}[Rank 2] Trainer log: {'loss': 0.7948, 'grad_norm': 6.146097660064697, 'learning_rate': 5.4816778248484725e-06}[Rank 1] Trainer log: {'loss': 0.7948, 'grad_norm': 6.146097660064697, 'learning_rate': 5.4816778248484725e-06} [Rank 3] Trainer log: {'loss': 0.7948, 'grad_norm': 6.146097660064697, 'learning_rate': 5.4816778248484725e-06} {'loss': 0.7948, 'grad_norm': 6.146097660064697, 'learning_rate': 5.4816778248484725e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.8165, 'grad_norm': 6.9685492515563965, 'learning_rate': 5.4755954722780236e-06}[Rank 3] Trainer log: {'loss': 0.8165, 'grad_norm': 6.9685492515563965, 'learning_rate': 5.4755954722780236e-06}[Rank 0] Trainer log: {'loss': 0.8165, 'grad_norm': 6.9685492515563965, 'learning_rate': 5.4755954722780236e-06} [Rank 2] Trainer log: {'loss': 0.8165, 'grad_norm': 6.9685492515563965, 'learning_rate': 5.4755954722780236e-06} {'loss': 0.8165, 'grad_norm': 6.9685492515563965, 'learning_rate': 5.4755954722780236e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.6396, 'grad_norm': 2.045679807662964, 'learning_rate': 5.469515223605824e-06}[Rank 1] Trainer log: {'loss': 0.6396, 'grad_norm': 2.045679807662964, 'learning_rate': 5.469515223605824e-06}[Rank 3] Trainer log: {'loss': 0.6396, 'grad_norm': 2.045679807662964, 'learning_rate': 5.469515223605824e-06} [Rank 0] Trainer log: {'loss': 0.6396, 'grad_norm': 2.045679807662964, 'learning_rate': 5.469515223605824e-06} {'loss': 0.6396, 'grad_norm': 2.045679807662964, 'learning_rate': 5.469515223605824e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.636, 'grad_norm': 9.75929069519043, 'learning_rate': 5.4634370816592595e-06}[Rank 3] Trainer log: {'loss': 0.636, 'grad_norm': 9.75929069519043, 'learning_rate': 5.4634370816592595e-06}[Rank 1] Trainer log: {'loss': 0.636, 'grad_norm': 9.75929069519043, 'learning_rate': 5.4634370816592595e-06} [Rank 0] Trainer log: {'loss': 0.636, 'grad_norm': 9.75929069519043, 'learning_rate': 5.4634370816592595e-06} {'loss': 0.636, 'grad_norm': 9.75929069519043, 'learning_rate': 5.4634370816592595e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.6754, 'grad_norm': 9.502302169799805, 'learning_rate': 5.4573610492647335e-06} [Rank 2] Trainer log: {'loss': 0.6754, 'grad_norm': 9.502302169799805, 'learning_rate': 5.4573610492647335e-06} [Rank 3] Trainer log: {'loss': 0.6754, 'grad_norm': 9.502302169799805, 'learning_rate': 5.4573610492647335e-06}[Rank 0] Trainer log: {'loss': 0.6754, 'grad_norm': 9.502302169799805, 'learning_rate': 5.4573610492647335e-06} {'loss': 0.6754, 'grad_norm': 9.502302169799805, 'learning_rate': 5.4573610492647335e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.8346, 'grad_norm': 5.776238918304443, 'learning_rate': 5.451287129247662e-06} [Rank 1] Trainer log: {'loss': 0.8346, 'grad_norm': 5.776238918304443, 'learning_rate': 5.451287129247662e-06}[Rank 3] Trainer log: {'loss': 0.8346, 'grad_norm': 5.776238918304443, 'learning_rate': 5.451287129247662e-06} [Rank 0] Trainer log: {'loss': 0.8346, 'grad_norm': 5.776238918304443, 'learning_rate': 5.451287129247662e-06} {'loss': 0.8346, 'grad_norm': 5.776238918304443, 'learning_rate': 5.451287129247662e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.8698, 'grad_norm': 5.899791717529297, 'learning_rate': 5.445215324432493e-06} [Rank 1] Trainer log: {'loss': 0.8698, 'grad_norm': 5.899791717529297, 'learning_rate': 5.445215324432493e-06}[Rank 3] Trainer log: {'loss': 0.8698, 'grad_norm': 5.899791717529297, 'learning_rate': 5.445215324432493e-06} [Rank 0] Trainer log: {'loss': 0.8698, 'grad_norm': 5.899791717529297, 'learning_rate': 5.445215324432493e-06} {'loss': 0.8698, 'grad_norm': 5.899791717529297, 'learning_rate': 5.445215324432493e-06, 'epoch': 0.67} [Rank 0] Trainer log: {'loss': 0.8217, 'grad_norm': 2.54923939704895, 'learning_rate': 5.43914563764268e-06}[Rank 2] Trainer log: {'loss': 0.8217, 'grad_norm': 2.54923939704895, 'learning_rate': 5.43914563764268e-06} [Rank 3] Trainer log: {'loss': 0.8217, 'grad_norm': 2.54923939704895, 'learning_rate': 5.43914563764268e-06}[Rank 1] Trainer log: {'loss': 0.8217, 'grad_norm': 2.54923939704895, 'learning_rate': 5.43914563764268e-06} {'loss': 0.8217, 'grad_norm': 2.54923939704895, 'learning_rate': 5.43914563764268e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.7613, 'grad_norm': 13.120377540588379, 'learning_rate': 5.433078071700694e-06}[Rank 2] Trainer log: {'loss': 0.7613, 'grad_norm': 13.120377540588379, 'learning_rate': 5.433078071700694e-06}[Rank 1] Trainer log: {'loss': 0.7613, 'grad_norm': 13.120377540588379, 'learning_rate': 5.433078071700694e-06} [Rank 0] Trainer log: {'loss': 0.7613, 'grad_norm': 13.120377540588379, 'learning_rate': 5.433078071700694e-06} {'loss': 0.7613, 'grad_norm': 13.120377540588379, 'learning_rate': 5.433078071700694e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.8693, 'grad_norm': 6.624530792236328, 'learning_rate': 5.427012629428016e-06} [Rank 3] Trainer log: {'loss': 0.8693, 'grad_norm': 6.624530792236328, 'learning_rate': 5.427012629428016e-06} [Rank 1] Trainer log: {'loss': 0.8693, 'grad_norm': 6.624530792236328, 'learning_rate': 5.427012629428016e-06} [Rank 0] Trainer log: {'loss': 0.8693, 'grad_norm': 6.624530792236328, 'learning_rate': 5.427012629428016e-06} {'loss': 0.8693, 'grad_norm': 6.624530792236328, 'learning_rate': 5.427012629428016e-06, 'epoch': 0.67} [Rank 0] Trainer log: {'loss': 0.5728, 'grad_norm': 4.7785563468933105, 'learning_rate': 5.420949313645148e-06}[Rank 2] Trainer log: {'loss': 0.5728, 'grad_norm': 4.7785563468933105, 'learning_rate': 5.420949313645148e-06} [Rank 1] Trainer log: {'loss': 0.5728, 'grad_norm': 4.7785563468933105, 'learning_rate': 5.420949313645148e-06} [Rank 3] Trainer log: {'loss': 0.5728, 'grad_norm': 4.7785563468933105, 'learning_rate': 5.420949313645148e-06} {'loss': 0.5728, 'grad_norm': 4.7785563468933105, 'learning_rate': 5.420949313645148e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.6913, 'grad_norm': 9.46249008178711, 'learning_rate': 5.414888127171605e-06} [Rank 3] Trainer log: {'loss': 0.6913, 'grad_norm': 9.46249008178711, 'learning_rate': 5.414888127171605e-06} [Rank 1] Trainer log: {'loss': 0.6913, 'grad_norm': 9.46249008178711, 'learning_rate': 5.414888127171605e-06} [Rank 0] Trainer log: {'loss': 0.6913, 'grad_norm': 9.46249008178711, 'learning_rate': 5.414888127171605e-06} {'loss': 0.6913, 'grad_norm': 9.46249008178711, 'learning_rate': 5.414888127171605e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.9443, 'grad_norm': 4.304903507232666, 'learning_rate': 5.4088290728258954e-06}[Rank 0] Trainer log: {'loss': 0.9443, 'grad_norm': 4.304903507232666, 'learning_rate': 5.4088290728258954e-06} [Rank 1] Trainer log: {'loss': 0.9443, 'grad_norm': 4.304903507232666, 'learning_rate': 5.4088290728258954e-06} [Rank 3] Trainer log: {'loss': 0.9443, 'grad_norm': 4.304903507232666, 'learning_rate': 5.4088290728258954e-06} {'loss': 0.9443, 'grad_norm': 4.304903507232666, 'learning_rate': 5.4088290728258954e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.8041, 'grad_norm': 7.322120666503906, 'learning_rate': 5.402772153425555e-06}[Rank 2] Trainer log: {'loss': 0.8041, 'grad_norm': 7.322120666503906, 'learning_rate': 5.402772153425555e-06} [Rank 0] Trainer log: {'loss': 0.8041, 'grad_norm': 7.322120666503906, 'learning_rate': 5.402772153425555e-06} [Rank 3] Trainer log: {'loss': 0.8041, 'grad_norm': 7.322120666503906, 'learning_rate': 5.402772153425555e-06} {'loss': 0.8041, 'grad_norm': 7.322120666503906, 'learning_rate': 5.402772153425555e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.9446, 'grad_norm': 4.773384094238281, 'learning_rate': 5.396717371787117e-06}[Rank 3] Trainer log: {'loss': 0.9446, 'grad_norm': 4.773384094238281, 'learning_rate': 5.396717371787117e-06}[Rank 0] Trainer log: {'loss': 0.9446, 'grad_norm': 4.773384094238281, 'learning_rate': 5.396717371787117e-06} [Rank 1] Trainer log: {'loss': 0.9446, 'grad_norm': 4.773384094238281, 'learning_rate': 5.396717371787117e-06} {'loss': 0.9446, 'grad_norm': 4.773384094238281, 'learning_rate': 5.396717371787117e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 1.0834, 'grad_norm': 6.23907470703125, 'learning_rate': 5.3906647307261136e-06}[Rank 1] Trainer log: {'loss': 1.0834, 'grad_norm': 6.23907470703125, 'learning_rate': 5.3906647307261136e-06}[Rank 2] Trainer log: {'loss': 1.0834, 'grad_norm': 6.23907470703125, 'learning_rate': 5.3906647307261136e-06} [Rank 0] Trainer log: {'loss': 1.0834, 'grad_norm': 6.23907470703125, 'learning_rate': 5.3906647307261136e-06} {'loss': 1.0834, 'grad_norm': 6.23907470703125, 'learning_rate': 5.3906647307261136e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.9074, 'grad_norm': 7.096158504486084, 'learning_rate': 5.384614233057101e-06}[Rank 0] Trainer log: {'loss': 0.9074, 'grad_norm': 7.096158504486084, 'learning_rate': 5.384614233057101e-06}[Rank 2] Trainer log: {'loss': 0.9074, 'grad_norm': 7.096158504486084, 'learning_rate': 5.384614233057101e-06} [Rank 1] Trainer log: {'loss': 0.9074, 'grad_norm': 7.096158504486084, 'learning_rate': 5.384614233057101e-06} {'loss': 0.9074, 'grad_norm': 7.096158504486084, 'learning_rate': 5.384614233057101e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7345, 'grad_norm': 6.496093273162842, 'learning_rate': 5.378565881593623e-06}[Rank 3] Trainer log: {'loss': 0.7345, 'grad_norm': 6.496093273162842, 'learning_rate': 5.378565881593623e-06} [Rank 0] Trainer log: {'loss': 0.7345, 'grad_norm': 6.496093273162842, 'learning_rate': 5.378565881593623e-06}[Rank 1] Trainer log: {'loss': 0.7345, 'grad_norm': 6.496093273162842, 'learning_rate': 5.378565881593623e-06} {'loss': 0.7345, 'grad_norm': 6.496093273162842, 'learning_rate': 5.378565881593623e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.725, 'grad_norm': 5.347692012786865, 'learning_rate': 5.372519679148227e-06} [Rank 0] Trainer log: {'loss': 0.725, 'grad_norm': 5.347692012786865, 'learning_rate': 5.372519679148227e-06}[Rank 3] Trainer log: {'loss': 0.725, 'grad_norm': 5.347692012786865, 'learning_rate': 5.372519679148227e-06} [Rank 1] Trainer log: {'loss': 0.725, 'grad_norm': 5.347692012786865, 'learning_rate': 5.372519679148227e-06} {'loss': 0.725, 'grad_norm': 5.347692012786865, 'learning_rate': 5.372519679148227e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.998, 'grad_norm': 6.410674571990967, 'learning_rate': 5.366475628532467e-06}[Rank 1] Trainer log: {'loss': 0.998, 'grad_norm': 6.410674571990967, 'learning_rate': 5.366475628532467e-06} [Rank 3] Trainer log: {'loss': 0.998, 'grad_norm': 6.410674571990967, 'learning_rate': 5.366475628532467e-06} [Rank 0] Trainer log: {'loss': 0.998, 'grad_norm': 6.410674571990967, 'learning_rate': 5.366475628532467e-06} {'loss': 0.998, 'grad_norm': 6.410674571990967, 'learning_rate': 5.366475628532467e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.8074, 'grad_norm': 5.404730796813965, 'learning_rate': 5.360433732556897e-06} [Rank 1] Trainer log: {'loss': 0.8074, 'grad_norm': 5.404730796813965, 'learning_rate': 5.360433732556897e-06} [Rank 0] Trainer log: {'loss': 0.8074, 'grad_norm': 5.404730796813965, 'learning_rate': 5.360433732556897e-06}[Rank 3] Trainer log: {'loss': 0.8074, 'grad_norm': 5.404730796813965, 'learning_rate': 5.360433732556897e-06} {'loss': 0.8074, 'grad_norm': 5.404730796813965, 'learning_rate': 5.360433732556897e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7319, 'grad_norm': 2.5564653873443604, 'learning_rate': 5.35439399403106e-06} [Rank 1] Trainer log: {'loss': 0.7319, 'grad_norm': 2.5564653873443604, 'learning_rate': 5.35439399403106e-06} [Rank 3] Trainer log: {'loss': 0.7319, 'grad_norm': 2.5564653873443604, 'learning_rate': 5.35439399403106e-06} [Rank 0] Trainer log: {'loss': 0.7319, 'grad_norm': 2.5564653873443604, 'learning_rate': 5.35439399403106e-06} {'loss': 0.7319, 'grad_norm': 2.5564653873443604, 'learning_rate': 5.35439399403106e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.9527, 'grad_norm': 3.3564882278442383, 'learning_rate': 5.348356415763505e-06}[Rank 0] Trainer log: {'loss': 0.9527, 'grad_norm': 3.3564882278442383, 'learning_rate': 5.348356415763505e-06} [Rank 2] Trainer log: {'loss': 0.9527, 'grad_norm': 3.3564882278442383, 'learning_rate': 5.348356415763505e-06} [Rank 3] Trainer log: {'loss': 0.9527, 'grad_norm': 3.3564882278442383, 'learning_rate': 5.348356415763505e-06} {'loss': 0.9527, 'grad_norm': 3.3564882278442383, 'learning_rate': 5.348356415763505e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.8644, 'grad_norm': 1.5314944982528687, 'learning_rate': 5.342321000561774e-06} [Rank 0] Trainer log: {'loss': 0.8644, 'grad_norm': 1.5314944982528687, 'learning_rate': 5.342321000561774e-06}[Rank 2] Trainer log: {'loss': 0.8644, 'grad_norm': 1.5314944982528687, 'learning_rate': 5.342321000561774e-06} [Rank 3] Trainer log: {'loss': 0.8644, 'grad_norm': 1.5314944982528687, 'learning_rate': 5.342321000561774e-06} {'loss': 0.8644, 'grad_norm': 1.5314944982528687, 'learning_rate': 5.342321000561774e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.9291, 'grad_norm': 6.494006156921387, 'learning_rate': 5.336287751232394e-06} [Rank 0] Trainer log: {'loss': 0.9291, 'grad_norm': 6.494006156921387, 'learning_rate': 5.336287751232394e-06}[Rank 1] Trainer log: {'loss': 0.9291, 'grad_norm': 6.494006156921387, 'learning_rate': 5.336287751232394e-06} [Rank 3] Trainer log: {'loss': 0.9291, 'grad_norm': 6.494006156921387, 'learning_rate': 5.336287751232394e-06} {'loss': 0.9291, 'grad_norm': 6.494006156921387, 'learning_rate': 5.336287751232394e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7647, 'grad_norm': 5.839931964874268, 'learning_rate': 5.330256670580902e-06} [Rank 0] Trainer log: {'loss': 0.7647, 'grad_norm': 5.839931964874268, 'learning_rate': 5.330256670580902e-06}[Rank 3] Trainer log: {'loss': 0.7647, 'grad_norm': 5.839931964874268, 'learning_rate': 5.330256670580902e-06} [Rank 1] Trainer log: {'loss': 0.7647, 'grad_norm': 5.839931964874268, 'learning_rate': 5.330256670580902e-06} {'loss': 0.7647, 'grad_norm': 5.839931964874268, 'learning_rate': 5.330256670580902e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.8606, 'grad_norm': 3.942594051361084, 'learning_rate': 5.3242277614118156e-06} [Rank 0] Trainer log: {'loss': 0.8606, 'grad_norm': 3.942594051361084, 'learning_rate': 5.3242277614118156e-06} [Rank 1] Trainer log: {'loss': 0.8606, 'grad_norm': 3.942594051361084, 'learning_rate': 5.3242277614118156e-06} {'loss': 0.8606, 'grad_norm': 3.942594051361084, 'learning_rate': 5.3242277614118156e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.8606, 'grad_norm': 3.942594051361084, 'learning_rate': 5.3242277614118156e-06} [Rank 2] Trainer log: {'loss': 1.0767, 'grad_norm': 5.108517646789551, 'learning_rate': 5.318201026528637e-06}[Rank 1] Trainer log: {'loss': 1.0767, 'grad_norm': 5.108517646789551, 'learning_rate': 5.318201026528637e-06} [Rank 0] Trainer log: {'loss': 1.0767, 'grad_norm': 5.108517646789551, 'learning_rate': 5.318201026528637e-06}[Rank 3] Trainer log: {'loss': 1.0767, 'grad_norm': 5.108517646789551, 'learning_rate': 5.318201026528637e-06} {'loss': 1.0767, 'grad_norm': 5.108517646789551, 'learning_rate': 5.318201026528637e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.7817, 'grad_norm': 2.5548129081726074, 'learning_rate': 5.31217646873387e-06}[Rank 2] Trainer log: {'loss': 0.7817, 'grad_norm': 2.5548129081726074, 'learning_rate': 5.31217646873387e-06}[Rank 3] Trainer log: {'loss': 0.7817, 'grad_norm': 2.5548129081726074, 'learning_rate': 5.31217646873387e-06} [Rank 0] Trainer log: {'loss': 0.7817, 'grad_norm': 2.5548129081726074, 'learning_rate': 5.31217646873387e-06} {'loss': 0.7817, 'grad_norm': 2.5548129081726074, 'learning_rate': 5.31217646873387e-06, 'epoch': 0.67} [Rank 0] Trainer log: {'loss': 0.8242, 'grad_norm': 4.747101306915283, 'learning_rate': 5.306154090829006e-06}[Rank 2] Trainer log: {'loss': 0.8242, 'grad_norm': 4.747101306915283, 'learning_rate': 5.306154090829006e-06} [Rank 1] Trainer log: {'loss': 0.8242, 'grad_norm': 4.747101306915283, 'learning_rate': 5.306154090829006e-06} [Rank 3] Trainer log: {'loss': 0.8242, 'grad_norm': 4.747101306915283, 'learning_rate': 5.306154090829006e-06} {'loss': 0.8242, 'grad_norm': 4.747101306915283, 'learning_rate': 5.306154090829006e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7197, 'grad_norm': 2.7474730014801025, 'learning_rate': 5.300133895614506e-06}[Rank 1] Trainer log: {'loss': 0.7197, 'grad_norm': 2.7474730014801025, 'learning_rate': 5.300133895614506e-06} [Rank 0] Trainer log: {'loss': 0.7197, 'grad_norm': 2.7474730014801025, 'learning_rate': 5.300133895614506e-06} [Rank 3] Trainer log: {'loss': 0.7197, 'grad_norm': 2.7474730014801025, 'learning_rate': 5.300133895614506e-06} {'loss': 0.7197, 'grad_norm': 2.7474730014801025, 'learning_rate': 5.300133895614506e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.7554, 'grad_norm': 6.700623989105225, 'learning_rate': 5.294115885889839e-06}[Rank 1] Trainer log: {'loss': 0.7554, 'grad_norm': 6.700623989105225, 'learning_rate': 5.294115885889839e-06}[Rank 2] Trainer log: {'loss': 0.7554, 'grad_norm': 6.700623989105225, 'learning_rate': 5.294115885889839e-06} [Rank 0] Trainer log: {'loss': 0.7554, 'grad_norm': 6.700623989105225, 'learning_rate': 5.294115885889839e-06} {'loss': 0.7554, 'grad_norm': 6.700623989105225, 'learning_rate': 5.294115885889839e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.8048, 'grad_norm': 2.0643954277038574, 'learning_rate': 5.288100064453442e-06}[Rank 1] Trainer log: {'loss': 0.8048, 'grad_norm': 2.0643954277038574, 'learning_rate': 5.288100064453442e-06}[Rank 2] Trainer log: {'loss': 0.8048, 'grad_norm': 2.0643954277038574, 'learning_rate': 5.288100064453442e-06} [Rank 0] Trainer log: {'loss': 0.8048, 'grad_norm': 2.0643954277038574, 'learning_rate': 5.288100064453442e-06} {'loss': 0.8048, 'grad_norm': 2.0643954277038574, 'learning_rate': 5.288100064453442e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.8752, 'grad_norm': 1.8334397077560425, 'learning_rate': 5.282086434102738e-06} [Rank 0] Trainer log: {'loss': 0.8752, 'grad_norm': 1.8334397077560425, 'learning_rate': 5.282086434102738e-06}[Rank 2] Trainer log: {'loss': 0.8752, 'grad_norm': 1.8334397077560425, 'learning_rate': 5.282086434102738e-06} [Rank 3] Trainer log: {'loss': 0.8752, 'grad_norm': 1.8334397077560425, 'learning_rate': 5.282086434102738e-06} {'loss': 0.8752, 'grad_norm': 1.8334397077560425, 'learning_rate': 5.282086434102738e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.6742, 'grad_norm': 1.95305335521698, 'learning_rate': 5.276074997634126e-06}[Rank 0] Trainer log: {'loss': 0.6742, 'grad_norm': 1.95305335521698, 'learning_rate': 5.276074997634126e-06}[Rank 1] Trainer log: {'loss': 0.6742, 'grad_norm': 1.95305335521698, 'learning_rate': 5.276074997634126e-06} [Rank 3] Trainer log: {'loss': 0.6742, 'grad_norm': 1.95305335521698, 'learning_rate': 5.276074997634126e-06} {'loss': 0.6742, 'grad_norm': 1.95305335521698, 'learning_rate': 5.276074997634126e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7603, 'grad_norm': 4.391607761383057, 'learning_rate': 5.270065757843e-06}[Rank 1] Trainer log: {'loss': 0.7603, 'grad_norm': 4.391607761383057, 'learning_rate': 5.270065757843e-06} [Rank 3] Trainer log: {'loss': 0.7603, 'grad_norm': 4.391607761383057, 'learning_rate': 5.270065757843e-06} [Rank 0] Trainer log: {'loss': 0.7603, 'grad_norm': 4.391607761383057, 'learning_rate': 5.270065757843e-06} {'loss': 0.7603, 'grad_norm': 4.391607761383057, 'learning_rate': 5.270065757843e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.9442, 'grad_norm': 2.968305826187134, 'learning_rate': 5.264058717523716e-06}[Rank 1] Trainer log: {'loss': 0.9442, 'grad_norm': 2.968305826187134, 'learning_rate': 5.264058717523716e-06}[Rank 3] Trainer log: {'loss': 0.9442, 'grad_norm': 2.968305826187134, 'learning_rate': 5.264058717523716e-06} [Rank 0] Trainer log: {'loss': 0.9442, 'grad_norm': 2.968305826187134, 'learning_rate': 5.264058717523716e-06} {'loss': 0.9442, 'grad_norm': 2.968305826187134, 'learning_rate': 5.264058717523716e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.7441, 'grad_norm': 2.7240874767303467, 'learning_rate': 5.258053879469617e-06}[Rank 2] Trainer log: {'loss': 0.7441, 'grad_norm': 2.7240874767303467, 'learning_rate': 5.258053879469617e-06} [Rank 3] Trainer log: {'loss': 0.7441, 'grad_norm': 2.7240874767303467, 'learning_rate': 5.258053879469617e-06} [Rank 0] Trainer log: {'loss': 0.7441, 'grad_norm': 2.7240874767303467, 'learning_rate': 5.258053879469617e-06} {'loss': 0.7441, 'grad_norm': 2.7240874767303467, 'learning_rate': 5.258053879469617e-06, 'epoch': 0.67} [Rank 3] Trainer log: {'loss': 0.8581, 'grad_norm': 7.062681674957275, 'learning_rate': 5.252051246473024e-06} [Rank 0] Trainer log: {'loss': 0.8581, 'grad_norm': 7.062681674957275, 'learning_rate': 5.252051246473024e-06}[Rank 2] Trainer log: {'loss': 0.8581, 'grad_norm': 7.062681674957275, 'learning_rate': 5.252051246473024e-06} [Rank 1] Trainer log: {'loss': 0.8581, 'grad_norm': 7.062681674957275, 'learning_rate': 5.252051246473024e-06} {'loss': 0.8581, 'grad_norm': 7.062681674957275, 'learning_rate': 5.252051246473024e-06, 'epoch': 0.67} [Rank 2] Trainer log: {'loss': 0.7033, 'grad_norm': 7.236454963684082, 'learning_rate': 5.246050821325224e-06}[Rank 0] Trainer log: {'loss': 0.7033, 'grad_norm': 7.236454963684082, 'learning_rate': 5.246050821325224e-06}[Rank 1] Trainer log: {'loss': 0.7033, 'grad_norm': 7.236454963684082, 'learning_rate': 5.246050821325224e-06} [Rank 3] Trainer log: {'loss': 0.7033, 'grad_norm': 7.236454963684082, 'learning_rate': 5.246050821325224e-06} {'loss': 0.7033, 'grad_norm': 7.236454963684082, 'learning_rate': 5.246050821325224e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.7218, 'grad_norm': 1.9302481412887573, 'learning_rate': 5.240052606816476e-06} [Rank 2] Trainer log: {'loss': 0.7218, 'grad_norm': 1.9302481412887573, 'learning_rate': 5.240052606816476e-06}[Rank 0] Trainer log: {'loss': 0.7218, 'grad_norm': 1.9302481412887573, 'learning_rate': 5.240052606816476e-06} [Rank 3] Trainer log: {'loss': 0.7218, 'grad_norm': 1.9302481412887573, 'learning_rate': 5.240052606816476e-06} {'loss': 0.7218, 'grad_norm': 1.9302481412887573, 'learning_rate': 5.240052606816476e-06, 'epoch': 0.67} [Rank 1] Trainer log: {'loss': 0.6561, 'grad_norm': 3.414271831512451, 'learning_rate': 5.234056605736025e-06} [Rank 0] Trainer log: {'loss': 0.6561, 'grad_norm': 3.414271831512451, 'learning_rate': 5.234056605736025e-06}[Rank 3] Trainer log: {'loss': 0.6561, 'grad_norm': 3.414271831512451, 'learning_rate': 5.234056605736025e-06} [Rank 2] Trainer log: {'loss': 0.6561, 'grad_norm': 3.414271831512451, 'learning_rate': 5.234056605736025e-06} {'loss': 0.6561, 'grad_norm': 3.414271831512451, 'learning_rate': 5.234056605736025e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9859, 'grad_norm': 2.0587003231048584, 'learning_rate': 5.228062820872074e-06}[Rank 1] Trainer log: {'loss': 0.9859, 'grad_norm': 2.0587003231048584, 'learning_rate': 5.228062820872074e-06}[Rank 3] Trainer log: {'loss': 0.9859, 'grad_norm': 2.0587003231048584, 'learning_rate': 5.228062820872074e-06} [Rank 0] Trainer log: {'loss': 0.9859, 'grad_norm': 2.0587003231048584, 'learning_rate': 5.228062820872074e-06} {'loss': 0.9859, 'grad_norm': 2.0587003231048584, 'learning_rate': 5.228062820872074e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7339, 'grad_norm': 8.409841537475586, 'learning_rate': 5.222071255011796e-06}[Rank 2] Trainer log: {'loss': 0.7339, 'grad_norm': 8.409841537475586, 'learning_rate': 5.222071255011796e-06}[Rank 3] Trainer log: {'loss': 0.7339, 'grad_norm': 8.409841537475586, 'learning_rate': 5.222071255011796e-06} [Rank 0] Trainer log: {'loss': 0.7339, 'grad_norm': 8.409841537475586, 'learning_rate': 5.222071255011796e-06} {'loss': 0.7339, 'grad_norm': 8.409841537475586, 'learning_rate': 5.222071255011796e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.811, 'grad_norm': 5.675786972045898, 'learning_rate': 5.216081910941342e-06}[Rank 2] Trainer log: {'loss': 0.811, 'grad_norm': 5.675786972045898, 'learning_rate': 5.216081910941342e-06} [Rank 1] Trainer log: {'loss': 0.811, 'grad_norm': 5.675786972045898, 'learning_rate': 5.216081910941342e-06} [Rank 0] Trainer log: {'loss': 0.811, 'grad_norm': 5.675786972045898, 'learning_rate': 5.216081910941342e-06} {'loss': 0.811, 'grad_norm': 5.675786972045898, 'learning_rate': 5.216081910941342e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.6982, 'grad_norm': 3.005995512008667, 'learning_rate': 5.210094791445817e-06}[Rank 2] Trainer log: {'loss': 0.6982, 'grad_norm': 3.005995512008667, 'learning_rate': 5.210094791445817e-06} [Rank 3] Trainer log: {'loss': 0.6982, 'grad_norm': 3.005995512008667, 'learning_rate': 5.210094791445817e-06} [Rank 0] Trainer log: {'loss': 0.6982, 'grad_norm': 3.005995512008667, 'learning_rate': 5.210094791445817e-06} {'loss': 0.6982, 'grad_norm': 3.005995512008667, 'learning_rate': 5.210094791445817e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 1.0701, 'grad_norm': 3.3126072883605957, 'learning_rate': 5.204109899309301e-06}[Rank 3] Trainer log: {'loss': 1.0701, 'grad_norm': 3.3126072883605957, 'learning_rate': 5.204109899309301e-06}[Rank 1] Trainer log: {'loss': 1.0701, 'grad_norm': 3.3126072883605957, 'learning_rate': 5.204109899309301e-06} [Rank 0] Trainer log: {'loss': 1.0701, 'grad_norm': 3.3126072883605957, 'learning_rate': 5.204109899309301e-06} {'loss': 1.0701, 'grad_norm': 3.3126072883605957, 'learning_rate': 5.204109899309301e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.9148, 'grad_norm': 12.567536354064941, 'learning_rate': 5.1981272373148395e-06}[Rank 3] Trainer log: {'loss': 0.9148, 'grad_norm': 12.567536354064941, 'learning_rate': 5.1981272373148395e-06}[Rank 2] Trainer log: {'loss': 0.9148, 'grad_norm': 12.567536354064941, 'learning_rate': 5.1981272373148395e-06} [Rank 0] Trainer log: {'loss': 0.9148, 'grad_norm': 12.567536354064941, 'learning_rate': 5.1981272373148395e-06} {'loss': 0.9148, 'grad_norm': 12.567536354064941, 'learning_rate': 5.1981272373148395e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9605, 'grad_norm': 2.771291971206665, 'learning_rate': 5.192146808244432e-06} [Rank 3] Trainer log: {'loss': 0.9605, 'grad_norm': 2.771291971206665, 'learning_rate': 5.192146808244432e-06} [Rank 0] Trainer log: {'loss': 0.9605, 'grad_norm': 2.771291971206665, 'learning_rate': 5.192146808244432e-06}[Rank 1] Trainer log: {'loss': 0.9605, 'grad_norm': 2.771291971206665, 'learning_rate': 5.192146808244432e-06} {'loss': 0.9605, 'grad_norm': 2.771291971206665, 'learning_rate': 5.192146808244432e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.9598, 'grad_norm': 5.872015953063965, 'learning_rate': 5.186168614879041e-06}[Rank 2] Trainer log: {'loss': 0.9598, 'grad_norm': 5.872015953063965, 'learning_rate': 5.186168614879041e-06}[Rank 0] Trainer log: {'loss': 0.9598, 'grad_norm': 5.872015953063965, 'learning_rate': 5.186168614879041e-06} [Rank 1] Trainer log: {'loss': 0.9598, 'grad_norm': 5.872015953063965, 'learning_rate': 5.186168614879041e-06} {'loss': 0.9598, 'grad_norm': 5.872015953063965, 'learning_rate': 5.186168614879041e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.8988, 'grad_norm': 2.1754119396209717, 'learning_rate': 5.180192659998601e-06}[Rank 3] Trainer log: {'loss': 0.8988, 'grad_norm': 2.1754119396209717, 'learning_rate': 5.180192659998601e-06}[Rank 0] Trainer log: {'loss': 0.8988, 'grad_norm': 2.1754119396209717, 'learning_rate': 5.180192659998601e-06} [Rank 2] Trainer log: {'loss': 0.8988, 'grad_norm': 2.1754119396209717, 'learning_rate': 5.180192659998601e-06} {'loss': 0.8988, 'grad_norm': 2.1754119396209717, 'learning_rate': 5.180192659998601e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 1.0294, 'grad_norm': 6.238635063171387, 'learning_rate': 5.174218946381993e-06}[Rank 1] Trainer log: {'loss': 1.0294, 'grad_norm': 6.238635063171387, 'learning_rate': 5.174218946381993e-06} [Rank 0] Trainer log: {'loss': 1.0294, 'grad_norm': 6.238635063171387, 'learning_rate': 5.174218946381993e-06} [Rank 2] Trainer log: {'loss': 1.0294, 'grad_norm': 6.238635063171387, 'learning_rate': 5.174218946381993e-06} {'loss': 1.0294, 'grad_norm': 6.238635063171387, 'learning_rate': 5.174218946381993e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.6436, 'grad_norm': 3.038994312286377, 'learning_rate': 5.168247476807054e-06}[Rank 1] Trainer log: {'loss': 0.6436, 'grad_norm': 3.038994312286377, 'learning_rate': 5.168247476807054e-06}[Rank 3] Trainer log: {'loss': 0.6436, 'grad_norm': 3.038994312286377, 'learning_rate': 5.168247476807054e-06} [Rank 0] Trainer log: {'loss': 0.6436, 'grad_norm': 3.038994312286377, 'learning_rate': 5.168247476807054e-06} {'loss': 0.6436, 'grad_norm': 3.038994312286377, 'learning_rate': 5.168247476807054e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.826, 'grad_norm': 4.359854698181152, 'learning_rate': 5.162278254050595e-06} [Rank 0] Trainer log: {'loss': 0.826, 'grad_norm': 4.359854698181152, 'learning_rate': 5.162278254050595e-06}[Rank 2] Trainer log: {'loss': 0.826, 'grad_norm': 4.359854698181152, 'learning_rate': 5.162278254050595e-06}[Rank 3] Trainer log: {'loss': 0.826, 'grad_norm': 4.359854698181152, 'learning_rate': 5.162278254050595e-06} {'loss': 0.826, 'grad_norm': 4.359854698181152, 'learning_rate': 5.162278254050595e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7737, 'grad_norm': 4.221808910369873, 'learning_rate': 5.15631128088836e-06}[Rank 3] Trainer log: {'loss': 0.7737, 'grad_norm': 4.221808910369873, 'learning_rate': 5.15631128088836e-06}[Rank 2] Trainer log: {'loss': 0.7737, 'grad_norm': 4.221808910369873, 'learning_rate': 5.15631128088836e-06} [Rank 0] Trainer log: {'loss': 0.7737, 'grad_norm': 4.221808910369873, 'learning_rate': 5.15631128088836e-06} {'loss': 0.7737, 'grad_norm': 4.221808910369873, 'learning_rate': 5.15631128088836e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9039, 'grad_norm': 3.348848581314087, 'learning_rate': 5.150346560095063e-06}[Rank 3] Trainer log: {'loss': 0.9039, 'grad_norm': 3.348848581314087, 'learning_rate': 5.150346560095063e-06}[Rank 1] Trainer log: {'loss': 0.9039, 'grad_norm': 3.348848581314087, 'learning_rate': 5.150346560095063e-06} [Rank 0] Trainer log: {'loss': 0.9039, 'grad_norm': 3.348848581314087, 'learning_rate': 5.150346560095063e-06} {'loss': 0.9039, 'grad_norm': 3.348848581314087, 'learning_rate': 5.150346560095063e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.5176, 'grad_norm': 11.792021751403809, 'learning_rate': 5.144384094444369e-06}[Rank 2] Trainer log: {'loss': 0.5176, 'grad_norm': 11.792021751403809, 'learning_rate': 5.144384094444369e-06} [Rank 3] Trainer log: {'loss': 0.5176, 'grad_norm': 11.792021751403809, 'learning_rate': 5.144384094444369e-06} [Rank 0] Trainer log: {'loss': 0.5176, 'grad_norm': 11.792021751403809, 'learning_rate': 5.144384094444369e-06} {'loss': 0.5176, 'grad_norm': 11.792021751403809, 'learning_rate': 5.144384094444369e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9778, 'grad_norm': 6.58959436416626, 'learning_rate': 5.138423886708886e-06}[Rank 0] Trainer log: {'loss': 0.9778, 'grad_norm': 6.58959436416626, 'learning_rate': 5.138423886708886e-06}[Rank 3] Trainer log: {'loss': 0.9778, 'grad_norm': 6.58959436416626, 'learning_rate': 5.138423886708886e-06} {'loss': 0.9778, 'grad_norm': 6.58959436416626, 'learning_rate': 5.138423886708886e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.9778, 'grad_norm': 6.58959436416626, 'learning_rate': 5.138423886708886e-06} [Rank 2] Trainer log: {'loss': 0.831, 'grad_norm': 2.898977041244507, 'learning_rate': 5.132465939660174e-06}[Rank 3] Trainer log: {'loss': 0.831, 'grad_norm': 2.898977041244507, 'learning_rate': 5.132465939660174e-06}[Rank 1] Trainer log: {'loss': 0.831, 'grad_norm': 2.898977041244507, 'learning_rate': 5.132465939660174e-06} [Rank 0] Trainer log: {'loss': 0.831, 'grad_norm': 2.898977041244507, 'learning_rate': 5.132465939660174e-06} {'loss': 0.831, 'grad_norm': 2.898977041244507, 'learning_rate': 5.132465939660174e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7767, 'grad_norm': 4.2872843742370605, 'learning_rate': 5.126510256068752e-06}[Rank 2] Trainer log: {'loss': 0.7767, 'grad_norm': 4.2872843742370605, 'learning_rate': 5.126510256068752e-06}[Rank 3] Trainer log: {'loss': 0.7767, 'grad_norm': 4.2872843742370605, 'learning_rate': 5.126510256068752e-06} [Rank 0] Trainer log: {'loss': 0.7767, 'grad_norm': 4.2872843742370605, 'learning_rate': 5.126510256068752e-06} {'loss': 0.7767, 'grad_norm': 4.2872843742370605, 'learning_rate': 5.126510256068752e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7707, 'grad_norm': 9.978556632995605, 'learning_rate': 5.120556838704074e-06}[Rank 0] Trainer log: {'loss': 0.7707, 'grad_norm': 9.978556632995605, 'learning_rate': 5.120556838704074e-06}[Rank 3] Trainer log: {'loss': 0.7707, 'grad_norm': 9.978556632995605, 'learning_rate': 5.120556838704074e-06} [Rank 2] Trainer log: {'loss': 0.7707, 'grad_norm': 9.978556632995605, 'learning_rate': 5.120556838704074e-06} {'loss': 0.7707, 'grad_norm': 9.978556632995605, 'learning_rate': 5.120556838704074e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.6346, 'grad_norm': 21.465848922729492, 'learning_rate': 5.114605690334542e-06}[Rank 1] Trainer log: {'loss': 0.6346, 'grad_norm': 21.465848922729492, 'learning_rate': 5.114605690334542e-06}[Rank 3] Trainer log: {'loss': 0.6346, 'grad_norm': 21.465848922729492, 'learning_rate': 5.114605690334542e-06} [Rank 0] Trainer log: {'loss': 0.6346, 'grad_norm': 21.465848922729492, 'learning_rate': 5.114605690334542e-06} {'loss': 0.6346, 'grad_norm': 21.465848922729492, 'learning_rate': 5.114605690334542e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.8379, 'grad_norm': 5.864024639129639, 'learning_rate': 5.108656813727508e-06} [Rank 3] Trainer log: {'loss': 0.8379, 'grad_norm': 5.864024639129639, 'learning_rate': 5.108656813727508e-06}[Rank 2] Trainer log: {'loss': 0.8379, 'grad_norm': 5.864024639129639, 'learning_rate': 5.108656813727508e-06} [Rank 0] Trainer log: {'loss': 0.8379, 'grad_norm': 5.864024639129639, 'learning_rate': 5.108656813727508e-06} {'loss': 0.8379, 'grad_norm': 5.864024639129639, 'learning_rate': 5.108656813727508e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.9029, 'grad_norm': 4.854063987731934, 'learning_rate': 5.102710211649273e-06}[Rank 1] Trainer log: {'loss': 0.9029, 'grad_norm': 4.854063987731934, 'learning_rate': 5.102710211649273e-06}[Rank 2] Trainer log: {'loss': 0.9029, 'grad_norm': 4.854063987731934, 'learning_rate': 5.102710211649273e-06} [Rank 0] Trainer log: {'loss': 0.9029, 'grad_norm': 4.854063987731934, 'learning_rate': 5.102710211649273e-06} {'loss': 0.9029, 'grad_norm': 4.854063987731934, 'learning_rate': 5.102710211649273e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9586, 'grad_norm': 3.4101743698120117, 'learning_rate': 5.096765886865066e-06}[Rank 1] Trainer log: {'loss': 0.9586, 'grad_norm': 3.4101743698120117, 'learning_rate': 5.096765886865066e-06} [Rank 3] Trainer log: {'loss': 0.9586, 'grad_norm': 3.4101743698120117, 'learning_rate': 5.096765886865066e-06} [Rank 0] Trainer log: {'loss': 0.9586, 'grad_norm': 3.4101743698120117, 'learning_rate': 5.096765886865066e-06} {'loss': 0.9586, 'grad_norm': 3.4101743698120117, 'learning_rate': 5.096765886865066e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.7745, 'grad_norm': 11.885663032531738, 'learning_rate': 5.090823842139061e-06} [Rank 1] Trainer log: {'loss': 0.7745, 'grad_norm': 11.885663032531738, 'learning_rate': 5.090823842139061e-06} [Rank 0] Trainer log: {'loss': 0.7745, 'grad_norm': 11.885663032531738, 'learning_rate': 5.090823842139061e-06}[Rank 3] Trainer log: {'loss': 0.7745, 'grad_norm': 11.885663032531738, 'learning_rate': 5.090823842139061e-06} {'loss': 0.7745, 'grad_norm': 11.885663032531738, 'learning_rate': 5.090823842139061e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.7297, 'grad_norm': 5.155250549316406, 'learning_rate': 5.0848840802343826e-06}[Rank 1] Trainer log: {'loss': 0.7297, 'grad_norm': 5.155250549316406, 'learning_rate': 5.0848840802343826e-06} [Rank 3] Trainer log: {'loss': 0.7297, 'grad_norm': 5.155250549316406, 'learning_rate': 5.0848840802343826e-06} [Rank 0] Trainer log: {'loss': 0.7297, 'grad_norm': 5.155250549316406, 'learning_rate': 5.0848840802343826e-06} {'loss': 0.7297, 'grad_norm': 5.155250549316406, 'learning_rate': 5.0848840802343826e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9527, 'grad_norm': 4.126802444458008, 'learning_rate': 5.0789466039130816e-06}[Rank 3] Trainer log: {'loss': 0.9527, 'grad_norm': 4.126802444458008, 'learning_rate': 5.0789466039130816e-06} [Rank 0] Trainer log: {'loss': 0.9527, 'grad_norm': 4.126802444458008, 'learning_rate': 5.0789466039130816e-06}[Rank 1] Trainer log: {'loss': 0.9527, 'grad_norm': 4.126802444458008, 'learning_rate': 5.0789466039130816e-06} {'loss': 0.9527, 'grad_norm': 4.126802444458008, 'learning_rate': 5.0789466039130816e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.753, 'grad_norm': 2.424243211746216, 'learning_rate': 5.073011415936147e-06}[Rank 2] Trainer log: {'loss': 0.753, 'grad_norm': 2.424243211746216, 'learning_rate': 5.073011415936147e-06}[Rank 0] Trainer log: {'loss': 0.753, 'grad_norm': 2.424243211746216, 'learning_rate': 5.073011415936147e-06} [Rank 3] Trainer log: {'loss': 0.753, 'grad_norm': 2.424243211746216, 'learning_rate': 5.073011415936147e-06} {'loss': 0.753, 'grad_norm': 2.424243211746216, 'learning_rate': 5.073011415936147e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.5097, 'grad_norm': 7.168045997619629, 'learning_rate': 5.067078519063514e-06} [Rank 2] Trainer log: {'loss': 0.5097, 'grad_norm': 7.168045997619629, 'learning_rate': 5.067078519063514e-06} [Rank 3] Trainer log: {'loss': 0.5097, 'grad_norm': 7.168045997619629, 'learning_rate': 5.067078519063514e-06}[Rank 0] Trainer log: {'loss': 0.5097, 'grad_norm': 7.168045997619629, 'learning_rate': 5.067078519063514e-06} {'loss': 0.5097, 'grad_norm': 7.168045997619629, 'learning_rate': 5.067078519063514e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9655, 'grad_norm': 6.620745658874512, 'learning_rate': 5.0611479160540385e-06}[Rank 1] Trainer log: {'loss': 0.9655, 'grad_norm': 6.620745658874512, 'learning_rate': 5.0611479160540385e-06}[Rank 0] Trainer log: {'loss': 0.9655, 'grad_norm': 6.620745658874512, 'learning_rate': 5.0611479160540385e-06} [Rank 3] Trainer log: {'loss': 0.9655, 'grad_norm': 6.620745658874512, 'learning_rate': 5.0611479160540385e-06} {'loss': 0.9655, 'grad_norm': 6.620745658874512, 'learning_rate': 5.0611479160540385e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.9514, 'grad_norm': 4.771853446960449, 'learning_rate': 5.0552196096655195e-06} [Rank 3] Trainer log: {'loss': 0.9514, 'grad_norm': 4.771853446960449, 'learning_rate': 5.0552196096655195e-06} [Rank 1] Trainer log: {'loss': 0.9514, 'grad_norm': 4.771853446960449, 'learning_rate': 5.0552196096655195e-06} [Rank 0] Trainer log: {'loss': 0.9514, 'grad_norm': 4.771853446960449, 'learning_rate': 5.0552196096655195e-06} {'loss': 0.9514, 'grad_norm': 4.771853446960449, 'learning_rate': 5.0552196096655195e-06, 'epoch': 0.68} [Rank 0] Trainer log: {'loss': 0.6286, 'grad_norm': 8.557195663452148, 'learning_rate': 5.04929360265469e-06}[Rank 3] Trainer log: {'loss': 0.6286, 'grad_norm': 8.557195663452148, 'learning_rate': 5.04929360265469e-06}[Rank 1] Trainer log: {'loss': 0.6286, 'grad_norm': 8.557195663452148, 'learning_rate': 5.04929360265469e-06} [Rank 2] Trainer log: {'loss': 0.6286, 'grad_norm': 8.557195663452148, 'learning_rate': 5.04929360265469e-06} {'loss': 0.6286, 'grad_norm': 8.557195663452148, 'learning_rate': 5.04929360265469e-06, 'epoch': 0.68} [Rank 0] Trainer log: {'loss': 0.7482, 'grad_norm': 4.380929946899414, 'learning_rate': 5.043369897777204e-06}[Rank 1] Trainer log: {'loss': 0.7482, 'grad_norm': 4.380929946899414, 'learning_rate': 5.043369897777204e-06} [Rank 2] Trainer log: {'loss': 0.7482, 'grad_norm': 4.380929946899414, 'learning_rate': 5.043369897777204e-06}[Rank 3] Trainer log: {'loss': 0.7482, 'grad_norm': 4.380929946899414, 'learning_rate': 5.043369897777204e-06} {'loss': 0.7482, 'grad_norm': 4.380929946899414, 'learning_rate': 5.043369897777204e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7173, 'grad_norm': 11.072699546813965, 'learning_rate': 5.037448497787647e-06}[Rank 2] Trainer log: {'loss': 0.7173, 'grad_norm': 11.072699546813965, 'learning_rate': 5.037448497787647e-06}[Rank 3] Trainer log: {'loss': 0.7173, 'grad_norm': 11.072699546813965, 'learning_rate': 5.037448497787647e-06} [Rank 0] Trainer log: {'loss': 0.7173, 'grad_norm': 11.072699546813965, 'learning_rate': 5.037448497787647e-06} {'loss': 0.7173, 'grad_norm': 11.072699546813965, 'learning_rate': 5.037448497787647e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.8183, 'grad_norm': 7.909434795379639, 'learning_rate': 5.031529405439544e-06}[Rank 1] Trainer log: {'loss': 0.8183, 'grad_norm': 7.909434795379639, 'learning_rate': 5.031529405439544e-06}[Rank 0] Trainer log: {'loss': 0.8183, 'grad_norm': 7.909434795379639, 'learning_rate': 5.031529405439544e-06} [Rank 3] Trainer log: {'loss': 0.8183, 'grad_norm': 7.909434795379639, 'learning_rate': 5.031529405439544e-06} {'loss': 0.8183, 'grad_norm': 7.909434795379639, 'learning_rate': 5.031529405439544e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.6239, 'grad_norm': 14.483028411865234, 'learning_rate': 5.025612623485332e-06}[Rank 2] Trainer log: {'loss': 0.6239, 'grad_norm': 14.483028411865234, 'learning_rate': 5.025612623485332e-06}[Rank 3] Trainer log: {'loss': 0.6239, 'grad_norm': 14.483028411865234, 'learning_rate': 5.025612623485332e-06} [Rank 0] Trainer log: {'loss': 0.6239, 'grad_norm': 14.483028411865234, 'learning_rate': 5.025612623485332e-06} {'loss': 0.6239, 'grad_norm': 14.483028411865234, 'learning_rate': 5.025612623485332e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.8267, 'grad_norm': 15.109184265136719, 'learning_rate': 5.01969815467638e-06}[Rank 1] Trainer log: {'loss': 0.8267, 'grad_norm': 15.109184265136719, 'learning_rate': 5.01969815467638e-06}[Rank 2] Trainer log: {'loss': 0.8267, 'grad_norm': 15.109184265136719, 'learning_rate': 5.01969815467638e-06} [Rank 0] Trainer log: {'loss': 0.8267, 'grad_norm': 15.109184265136719, 'learning_rate': 5.01969815467638e-06} {'loss': 0.8267, 'grad_norm': 15.109184265136719, 'learning_rate': 5.01969815467638e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.8194, 'grad_norm': 3.385042905807495, 'learning_rate': 5.013786001762986e-06}[Rank 3] Trainer log: {'loss': 0.8194, 'grad_norm': 3.385042905807495, 'learning_rate': 5.013786001762986e-06}[Rank 1] Trainer log: {'loss': 0.8194, 'grad_norm': 3.385042905807495, 'learning_rate': 5.013786001762986e-06} [Rank 0] Trainer log: {'loss': 0.8194, 'grad_norm': 3.385042905807495, 'learning_rate': 5.013786001762986e-06} {'loss': 0.8194, 'grad_norm': 3.385042905807495, 'learning_rate': 5.013786001762986e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.7334, 'grad_norm': 10.030415534973145, 'learning_rate': 5.007876167494361e-06}[Rank 2] Trainer log: {'loss': 0.7334, 'grad_norm': 10.030415534973145, 'learning_rate': 5.007876167494361e-06}[Rank 1] Trainer log: {'loss': 0.7334, 'grad_norm': 10.030415534973145, 'learning_rate': 5.007876167494361e-06} [Rank 0] Trainer log: {'loss': 0.7334, 'grad_norm': 10.030415534973145, 'learning_rate': 5.007876167494361e-06} {'loss': 0.7334, 'grad_norm': 10.030415534973145, 'learning_rate': 5.007876167494361e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.7732, 'grad_norm': 10.204215049743652, 'learning_rate': 5.001968654618646e-06} [Rank 1] Trainer log: {'loss': 0.7732, 'grad_norm': 10.204215049743652, 'learning_rate': 5.001968654618646e-06}[Rank 3] Trainer log: {'loss': 0.7732, 'grad_norm': 10.204215049743652, 'learning_rate': 5.001968654618646e-06}[Rank 0] Trainer log: {'loss': 0.7732, 'grad_norm': 10.204215049743652, 'learning_rate': 5.001968654618646e-06} {'loss': 0.7732, 'grad_norm': 10.204215049743652, 'learning_rate': 5.001968654618646e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.8429, 'grad_norm': 4.273606300354004, 'learning_rate': 4.996063465882904e-06}[Rank 1] Trainer log: {'loss': 0.8429, 'grad_norm': 4.273606300354004, 'learning_rate': 4.996063465882904e-06}[Rank 3] Trainer log: {'loss': 0.8429, 'grad_norm': 4.273606300354004, 'learning_rate': 4.996063465882904e-06} [Rank 0] Trainer log: {'loss': 0.8429, 'grad_norm': 4.273606300354004, 'learning_rate': 4.996063465882904e-06} {'loss': 0.8429, 'grad_norm': 4.273606300354004, 'learning_rate': 4.996063465882904e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.7673, 'grad_norm': 2.332062005996704, 'learning_rate': 4.9901606040331104e-06}[Rank 3] Trainer log: {'loss': 0.7673, 'grad_norm': 2.332062005996704, 'learning_rate': 4.9901606040331104e-06} [Rank 1] Trainer log: {'loss': 0.7673, 'grad_norm': 2.332062005996704, 'learning_rate': 4.9901606040331104e-06} [Rank 0] Trainer log: {'loss': 0.7673, 'grad_norm': 2.332062005996704, 'learning_rate': 4.9901606040331104e-06} {'loss': 0.7673, 'grad_norm': 2.332062005996704, 'learning_rate': 4.9901606040331104e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.8241, 'grad_norm': 5.22057580947876, 'learning_rate': 4.9842600718141575e-06}[Rank 2] Trainer log: {'loss': 0.8241, 'grad_norm': 5.22057580947876, 'learning_rate': 4.9842600718141575e-06} [Rank 1] Trainer log: {'loss': 0.8241, 'grad_norm': 5.22057580947876, 'learning_rate': 4.9842600718141575e-06} [Rank 0] Trainer log: {'loss': 0.8241, 'grad_norm': 5.22057580947876, 'learning_rate': 4.9842600718141575e-06} {'loss': 0.8241, 'grad_norm': 5.22057580947876, 'learning_rate': 4.9842600718141575e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.8962, 'grad_norm': 6.7751994132995605, 'learning_rate': 4.9783618719698645e-06}[Rank 2] Trainer log: {'loss': 0.8962, 'grad_norm': 6.7751994132995605, 'learning_rate': 4.9783618719698645e-06}[Rank 3] Trainer log: {'loss': 0.8962, 'grad_norm': 6.7751994132995605, 'learning_rate': 4.9783618719698645e-06} [Rank 0] Trainer log: {'loss': 0.8962, 'grad_norm': 6.7751994132995605, 'learning_rate': 4.9783618719698645e-06} {'loss': 0.8962, 'grad_norm': 6.7751994132995605, 'learning_rate': 4.9783618719698645e-06, 'epoch': 0.68} [Rank 3] Trainer log: {'loss': 0.942, 'grad_norm': 2.6124203205108643, 'learning_rate': 4.9724660072429564e-06}[Rank 1] Trainer log: {'loss': 0.942, 'grad_norm': 2.6124203205108643, 'learning_rate': 4.9724660072429564e-06}[Rank 0] Trainer log: {'loss': 0.942, 'grad_norm': 2.6124203205108643, 'learning_rate': 4.9724660072429564e-06} [Rank 2] Trainer log: {'loss': 0.942, 'grad_norm': 2.6124203205108643, 'learning_rate': 4.9724660072429564e-06} {'loss': 0.942, 'grad_norm': 2.6124203205108643, 'learning_rate': 4.9724660072429564e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 0.6992, 'grad_norm': 2.857950210571289, 'learning_rate': 4.966572480375076e-06}[Rank 3] Trainer log: {'loss': 0.6992, 'grad_norm': 2.857950210571289, 'learning_rate': 4.966572480375076e-06}[Rank 1] Trainer log: {'loss': 0.6992, 'grad_norm': 2.857950210571289, 'learning_rate': 4.966572480375076e-06} [Rank 0] Trainer log: {'loss': 0.6992, 'grad_norm': 2.857950210571289, 'learning_rate': 4.966572480375076e-06} {'loss': 0.6992, 'grad_norm': 2.857950210571289, 'learning_rate': 4.966572480375076e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.7709, 'grad_norm': 5.020540237426758, 'learning_rate': 4.960681294106775e-06}[Rank 2] Trainer log: {'loss': 0.7709, 'grad_norm': 5.020540237426758, 'learning_rate': 4.960681294106775e-06}[Rank 3] Trainer log: {'loss': 0.7709, 'grad_norm': 5.020540237426758, 'learning_rate': 4.960681294106775e-06} [Rank 0] Trainer log: {'loss': 0.7709, 'grad_norm': 5.020540237426758, 'learning_rate': 4.960681294106775e-06} {'loss': 0.7709, 'grad_norm': 5.020540237426758, 'learning_rate': 4.960681294106775e-06, 'epoch': 0.68} [Rank 2] Trainer log: {'loss': 1.0372, 'grad_norm': 4.022956371307373, 'learning_rate': 4.954792451177522e-06}[Rank 3] Trainer log: {'loss': 1.0372, 'grad_norm': 4.022956371307373, 'learning_rate': 4.954792451177522e-06} [Rank 0] Trainer log: {'loss': 1.0372, 'grad_norm': 4.022956371307373, 'learning_rate': 4.954792451177522e-06}[Rank 1] Trainer log: {'loss': 1.0372, 'grad_norm': 4.022956371307373, 'learning_rate': 4.954792451177522e-06} {'loss': 1.0372, 'grad_norm': 4.022956371307373, 'learning_rate': 4.954792451177522e-06, 'epoch': 0.68} [Rank 1] Trainer log: {'loss': 0.8604, 'grad_norm': 2.689640760421753, 'learning_rate': 4.948905954325702e-06}[Rank 3] Trainer log: {'loss': 0.8604, 'grad_norm': 2.689640760421753, 'learning_rate': 4.948905954325702e-06}[Rank 2] Trainer log: {'loss': 0.8604, 'grad_norm': 2.689640760421753, 'learning_rate': 4.948905954325702e-06} [Rank 0] Trainer log: {'loss': 0.8604, 'grad_norm': 2.689640760421753, 'learning_rate': 4.948905954325702e-06} {'loss': 0.8604, 'grad_norm': 2.689640760421753, 'learning_rate': 4.948905954325702e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8557, 'grad_norm': 5.5533246994018555, 'learning_rate': 4.9430218062885905e-06}[Rank 0] Trainer log: {'loss': 0.8557, 'grad_norm': 5.5533246994018555, 'learning_rate': 4.9430218062885905e-06}[Rank 1] Trainer log: {'loss': 0.8557, 'grad_norm': 5.5533246994018555, 'learning_rate': 4.9430218062885905e-06} [Rank 2] Trainer log: {'loss': 0.8557, 'grad_norm': 5.5533246994018555, 'learning_rate': 4.9430218062885905e-06} {'loss': 0.8557, 'grad_norm': 5.5533246994018555, 'learning_rate': 4.9430218062885905e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 1.0796, 'grad_norm': 5.48032808303833, 'learning_rate': 4.93714000980239e-06}[Rank 2] Trainer log: {'loss': 1.0796, 'grad_norm': 5.48032808303833, 'learning_rate': 4.93714000980239e-06} [Rank 1] Trainer log: {'loss': 1.0796, 'grad_norm': 5.48032808303833, 'learning_rate': 4.93714000980239e-06} [Rank 0] Trainer log: {'loss': 1.0796, 'grad_norm': 5.48032808303833, 'learning_rate': 4.93714000980239e-06} {'loss': 1.0796, 'grad_norm': 5.48032808303833, 'learning_rate': 4.93714000980239e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 1.0598, 'grad_norm': 2.4152426719665527, 'learning_rate': 4.9312605676021976e-06}[Rank 3] Trainer log: {'loss': 1.0598, 'grad_norm': 2.4152426719665527, 'learning_rate': 4.9312605676021976e-06} [Rank 1] Trainer log: {'loss': 1.0598, 'grad_norm': 2.4152426719665527, 'learning_rate': 4.9312605676021976e-06} [Rank 0] Trainer log: {'loss': 1.0598, 'grad_norm': 2.4152426719665527, 'learning_rate': 4.9312605676021976e-06} {'loss': 1.0598, 'grad_norm': 2.4152426719665527, 'learning_rate': 4.9312605676021976e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.5857, 'grad_norm': 9.35662841796875, 'learning_rate': 4.925383482422015e-06} [Rank 3] Trainer log: {'loss': 0.5857, 'grad_norm': 9.35662841796875, 'learning_rate': 4.925383482422015e-06} [Rank 1] Trainer log: {'loss': 0.5857, 'grad_norm': 9.35662841796875, 'learning_rate': 4.925383482422015e-06}[Rank 0] Trainer log: {'loss': 0.5857, 'grad_norm': 9.35662841796875, 'learning_rate': 4.925383482422015e-06} {'loss': 0.5857, 'grad_norm': 9.35662841796875, 'learning_rate': 4.925383482422015e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.7275, 'grad_norm': 16.011768341064453, 'learning_rate': 4.91950875699476e-06}[Rank 3] Trainer log: {'loss': 0.7275, 'grad_norm': 16.011768341064453, 'learning_rate': 4.91950875699476e-06} [Rank 0] Trainer log: {'loss': 0.7275, 'grad_norm': 16.011768341064453, 'learning_rate': 4.91950875699476e-06}[Rank 1] Trainer log: {'loss': 0.7275, 'grad_norm': 16.011768341064453, 'learning_rate': 4.91950875699476e-06} {'loss': 0.7275, 'grad_norm': 16.011768341064453, 'learning_rate': 4.91950875699476e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.8418, 'grad_norm': 8.475871086120605, 'learning_rate': 4.9136363940522394e-06}[Rank 3] Trainer log: {'loss': 0.8418, 'grad_norm': 8.475871086120605, 'learning_rate': 4.9136363940522394e-06} [Rank 1] Trainer log: {'loss': 0.8418, 'grad_norm': 8.475871086120605, 'learning_rate': 4.9136363940522394e-06} [Rank 0] Trainer log: {'loss': 0.8418, 'grad_norm': 8.475871086120605, 'learning_rate': 4.9136363940522394e-06} {'loss': 0.8418, 'grad_norm': 8.475871086120605, 'learning_rate': 4.9136363940522394e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.6177, 'grad_norm': 6.07219123840332, 'learning_rate': 4.907766396325165e-06}[Rank 0] Trainer log: {'loss': 0.6177, 'grad_norm': 6.07219123840332, 'learning_rate': 4.907766396325165e-06} [Rank 2] Trainer log: {'loss': 0.6177, 'grad_norm': 6.07219123840332, 'learning_rate': 4.907766396325165e-06} [Rank 3] Trainer log: {'loss': 0.6177, 'grad_norm': 6.07219123840332, 'learning_rate': 4.907766396325165e-06} {'loss': 0.6177, 'grad_norm': 6.07219123840332, 'learning_rate': 4.907766396325165e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 1.0121, 'grad_norm': 1.4330066442489624, 'learning_rate': 4.901898766543155e-06}[Rank 2] Trainer log: {'loss': 1.0121, 'grad_norm': 1.4330066442489624, 'learning_rate': 4.901898766543155e-06} [Rank 3] Trainer log: {'loss': 1.0121, 'grad_norm': 1.4330066442489624, 'learning_rate': 4.901898766543155e-06} [Rank 0] Trainer log: {'loss': 1.0121, 'grad_norm': 1.4330066442489624, 'learning_rate': 4.901898766543155e-06} {'loss': 1.0121, 'grad_norm': 1.4330066442489624, 'learning_rate': 4.901898766543155e-06, 'epoch': 0.69} [Rank 0] Trainer log: {'loss': 0.821, 'grad_norm': 4.235480308532715, 'learning_rate': 4.896033507434725e-06}[Rank 2] Trainer log: {'loss': 0.821, 'grad_norm': 4.235480308532715, 'learning_rate': 4.896033507434725e-06}[Rank 3] Trainer log: {'loss': 0.821, 'grad_norm': 4.235480308532715, 'learning_rate': 4.896033507434725e-06} [Rank 1] Trainer log: {'loss': 0.821, 'grad_norm': 4.235480308532715, 'learning_rate': 4.896033507434725e-06} {'loss': 0.821, 'grad_norm': 4.235480308532715, 'learning_rate': 4.896033507434725e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.8011, 'grad_norm': 2.817990779876709, 'learning_rate': 4.89017062172728e-06}[Rank 2] Trainer log: {'loss': 0.8011, 'grad_norm': 2.817990779876709, 'learning_rate': 4.89017062172728e-06} [Rank 0] Trainer log: {'loss': 0.8011, 'grad_norm': 2.817990779876709, 'learning_rate': 4.89017062172728e-06}[Rank 3] Trainer log: {'loss': 0.8011, 'grad_norm': 2.817990779876709, 'learning_rate': 4.89017062172728e-06} {'loss': 0.8011, 'grad_norm': 2.817990779876709, 'learning_rate': 4.89017062172728e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.9775, 'grad_norm': 7.69783353805542, 'learning_rate': 4.884310112147131e-06}[Rank 1] Trainer log: {'loss': 0.9775, 'grad_norm': 7.69783353805542, 'learning_rate': 4.884310112147131e-06} [Rank 2] Trainer log: {'loss': 0.9775, 'grad_norm': 7.69783353805542, 'learning_rate': 4.884310112147131e-06} [Rank 0] Trainer log: {'loss': 0.9775, 'grad_norm': 7.69783353805542, 'learning_rate': 4.884310112147131e-06} {'loss': 0.9775, 'grad_norm': 7.69783353805542, 'learning_rate': 4.884310112147131e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.9211, 'grad_norm': 5.52374267578125, 'learning_rate': 4.878451981419477e-06} [Rank 3] Trainer log: {'loss': 0.9211, 'grad_norm': 5.52374267578125, 'learning_rate': 4.878451981419477e-06} [Rank 1] Trainer log: {'loss': 0.9211, 'grad_norm': 5.52374267578125, 'learning_rate': 4.878451981419477e-06} [Rank 0] Trainer log: {'loss': 0.9211, 'grad_norm': 5.52374267578125, 'learning_rate': 4.878451981419477e-06} {'loss': 0.9211, 'grad_norm': 5.52374267578125, 'learning_rate': 4.878451981419477e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.7953, 'grad_norm': 7.1923956871032715, 'learning_rate': 4.872596232268412e-06} [Rank 1] Trainer log: {'loss': 0.7953, 'grad_norm': 7.1923956871032715, 'learning_rate': 4.872596232268412e-06} [Rank 3] Trainer log: {'loss': 0.7953, 'grad_norm': 7.1923956871032715, 'learning_rate': 4.872596232268412e-06} [Rank 0] Trainer log: {'loss': 0.7953, 'grad_norm': 7.1923956871032715, 'learning_rate': 4.872596232268412e-06} {'loss': 0.7953, 'grad_norm': 7.1923956871032715, 'learning_rate': 4.872596232268412e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.841, 'grad_norm': 5.9458465576171875, 'learning_rate': 4.86674286741693e-06} [Rank 2] Trainer log: {'loss': 0.841, 'grad_norm': 5.9458465576171875, 'learning_rate': 4.86674286741693e-06} [Rank 3] Trainer log: {'loss': 0.841, 'grad_norm': 5.9458465576171875, 'learning_rate': 4.86674286741693e-06} [Rank 0] Trainer log: {'loss': 0.841, 'grad_norm': 5.9458465576171875, 'learning_rate': 4.86674286741693e-06} {'loss': 0.841, 'grad_norm': 5.9458465576171875, 'learning_rate': 4.86674286741693e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.8797, 'grad_norm': 3.47592830657959, 'learning_rate': 4.8608918895869075e-06} [Rank 3] Trainer log: {'loss': 0.8797, 'grad_norm': 3.47592830657959, 'learning_rate': 4.8608918895869075e-06} [Rank 1] Trainer log: {'loss': 0.8797, 'grad_norm': 3.47592830657959, 'learning_rate': 4.8608918895869075e-06}[Rank 0] Trainer log: {'loss': 0.8797, 'grad_norm': 3.47592830657959, 'learning_rate': 4.8608918895869075e-06} {'loss': 0.8797, 'grad_norm': 3.47592830657959, 'learning_rate': 4.8608918895869075e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.97, 'grad_norm': 3.249403715133667, 'learning_rate': 4.8550433014991095e-06}[Rank 3] Trainer log: {'loss': 0.97, 'grad_norm': 3.249403715133667, 'learning_rate': 4.8550433014991095e-06} [Rank 0] Trainer log: {'loss': 0.97, 'grad_norm': 3.249403715133667, 'learning_rate': 4.8550433014991095e-06} [Rank 1] Trainer log: {'loss': 0.97, 'grad_norm': 3.249403715133667, 'learning_rate': 4.8550433014991095e-06} {'loss': 0.97, 'grad_norm': 3.249403715133667, 'learning_rate': 4.8550433014991095e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.6435, 'grad_norm': 2.7653913497924805, 'learning_rate': 4.849197105873199e-06}[Rank 1] Trainer log: {'loss': 0.6435, 'grad_norm': 2.7653913497924805, 'learning_rate': 4.849197105873199e-06} [Rank 0] Trainer log: {'loss': 0.6435, 'grad_norm': 2.7653913497924805, 'learning_rate': 4.849197105873199e-06}[Rank 3] Trainer log: {'loss': 0.6435, 'grad_norm': 2.7653913497924805, 'learning_rate': 4.849197105873199e-06} {'loss': 0.6435, 'grad_norm': 2.7653913497924805, 'learning_rate': 4.849197105873199e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.844, 'grad_norm': 4.089816570281982, 'learning_rate': 4.843353305427725e-06}[Rank 3] Trainer log: {'loss': 0.844, 'grad_norm': 4.089816570281982, 'learning_rate': 4.843353305427725e-06}[Rank 0] Trainer log: {'loss': 0.844, 'grad_norm': 4.089816570281982, 'learning_rate': 4.843353305427725e-06} [Rank 1] Trainer log: {'loss': 0.844, 'grad_norm': 4.089816570281982, 'learning_rate': 4.843353305427725e-06} {'loss': 0.844, 'grad_norm': 4.089816570281982, 'learning_rate': 4.843353305427725e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.7318, 'grad_norm': 8.384546279907227, 'learning_rate': 4.837511902880112e-06}[Rank 3] Trainer log: {'loss': 0.7318, 'grad_norm': 8.384546279907227, 'learning_rate': 4.837511902880112e-06}[Rank 2] Trainer log: {'loss': 0.7318, 'grad_norm': 8.384546279907227, 'learning_rate': 4.837511902880112e-06} [Rank 0] Trainer log: {'loss': 0.7318, 'grad_norm': 8.384546279907227, 'learning_rate': 4.837511902880112e-06} {'loss': 0.7318, 'grad_norm': 8.384546279907227, 'learning_rate': 4.837511902880112e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.8351, 'grad_norm': 6.642648696899414, 'learning_rate': 4.831672900946686e-06} [Rank 0] Trainer log: {'loss': 0.8351, 'grad_norm': 6.642648696899414, 'learning_rate': 4.831672900946686e-06}[Rank 1] Trainer log: {'loss': 0.8351, 'grad_norm': 6.642648696899414, 'learning_rate': 4.831672900946686e-06} [Rank 3] Trainer log: {'loss': 0.8351, 'grad_norm': 6.642648696899414, 'learning_rate': 4.831672900946686e-06} {'loss': 0.8351, 'grad_norm': 6.642648696899414, 'learning_rate': 4.831672900946686e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 1.0556, 'grad_norm': 3.08756947517395, 'learning_rate': 4.82583630234264e-06}[Rank 3] Trainer log: {'loss': 1.0556, 'grad_norm': 3.08756947517395, 'learning_rate': 4.82583630234264e-06} [Rank 0] Trainer log: {'loss': 1.0556, 'grad_norm': 3.08756947517395, 'learning_rate': 4.82583630234264e-06} [Rank 1] Trainer log: {'loss': 1.0556, 'grad_norm': 3.08756947517395, 'learning_rate': 4.82583630234264e-06} {'loss': 1.0556, 'grad_norm': 3.08756947517395, 'learning_rate': 4.82583630234264e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.9089, 'grad_norm': 4.854649066925049, 'learning_rate': 4.820002109782062e-06}[Rank 2] Trainer log: {'loss': 0.9089, 'grad_norm': 4.854649066925049, 'learning_rate': 4.820002109782062e-06} [Rank 1] Trainer log: {'loss': 0.9089, 'grad_norm': 4.854649066925049, 'learning_rate': 4.820002109782062e-06} [Rank 0] Trainer log: {'loss': 0.9089, 'grad_norm': 4.854649066925049, 'learning_rate': 4.820002109782062e-06} {'loss': 0.9089, 'grad_norm': 4.854649066925049, 'learning_rate': 4.820002109782062e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8945, 'grad_norm': 2.0605945587158203, 'learning_rate': 4.814170325977907e-06}[Rank 2] Trainer log: {'loss': 0.8945, 'grad_norm': 2.0605945587158203, 'learning_rate': 4.814170325977907e-06}[Rank 1] Trainer log: {'loss': 0.8945, 'grad_norm': 2.0605945587158203, 'learning_rate': 4.814170325977907e-06} [Rank 0] Trainer log: {'loss': 0.8945, 'grad_norm': 2.0605945587158203, 'learning_rate': 4.814170325977907e-06} {'loss': 0.8945, 'grad_norm': 2.0605945587158203, 'learning_rate': 4.814170325977907e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.8843, 'grad_norm': 4.287668704986572, 'learning_rate': 4.8083409536420315e-06}[Rank 2] Trainer log: {'loss': 0.8843, 'grad_norm': 4.287668704986572, 'learning_rate': 4.8083409536420315e-06} [Rank 3] Trainer log: {'loss': 0.8843, 'grad_norm': 4.287668704986572, 'learning_rate': 4.8083409536420315e-06} [Rank 0] Trainer log: {'loss': 0.8843, 'grad_norm': 4.287668704986572, 'learning_rate': 4.8083409536420315e-06} {'loss': 0.8843, 'grad_norm': 4.287668704986572, 'learning_rate': 4.8083409536420315e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8473, 'grad_norm': 4.257553577423096, 'learning_rate': 4.8025139954851485e-06} [Rank 1] Trainer log: {'loss': 0.8473, 'grad_norm': 4.257553577423096, 'learning_rate': 4.8025139954851485e-06} [Rank 0] Trainer log: {'loss': 0.8473, 'grad_norm': 4.257553577423096, 'learning_rate': 4.8025139954851485e-06}[Rank 2] Trainer log: {'loss': 0.8473, 'grad_norm': 4.257553577423096, 'learning_rate': 4.8025139954851485e-06} {'loss': 0.8473, 'grad_norm': 4.257553577423096, 'learning_rate': 4.8025139954851485e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.7699, 'grad_norm': 3.098492383956909, 'learning_rate': 4.796689454216861e-06}[Rank 1] Trainer log: {'loss': 0.7699, 'grad_norm': 3.098492383956909, 'learning_rate': 4.796689454216861e-06} [Rank 0] Trainer log: {'loss': 0.7699, 'grad_norm': 3.098492383956909, 'learning_rate': 4.796689454216861e-06} [Rank 3] Trainer log: {'loss': 0.7699, 'grad_norm': 3.098492383956909, 'learning_rate': 4.796689454216861e-06} {'loss': 0.7699, 'grad_norm': 3.098492383956909, 'learning_rate': 4.796689454216861e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 1.0198, 'grad_norm': 4.1390299797058105, 'learning_rate': 4.790867332545651e-06} [Rank 1] Trainer log: {'loss': 1.0198, 'grad_norm': 4.1390299797058105, 'learning_rate': 4.790867332545651e-06}[Rank 0] Trainer log: {'loss': 1.0198, 'grad_norm': 4.1390299797058105, 'learning_rate': 4.790867332545651e-06} [Rank 3] Trainer log: {'loss': 1.0198, 'grad_norm': 4.1390299797058105, 'learning_rate': 4.790867332545651e-06} {'loss': 1.0198, 'grad_norm': 4.1390299797058105, 'learning_rate': 4.790867332545651e-06, 'epoch': 0.69} [Rank 0] Trainer log: {'loss': 0.7231, 'grad_norm': 7.084643840789795, 'learning_rate': 4.785047633178864e-06}[Rank 2] Trainer log: {'loss': 0.7231, 'grad_norm': 7.084643840789795, 'learning_rate': 4.785047633178864e-06}[Rank 3] Trainer log: {'loss': 0.7231, 'grad_norm': 7.084643840789795, 'learning_rate': 4.785047633178864e-06} [Rank 1] Trainer log: {'loss': 0.7231, 'grad_norm': 7.084643840789795, 'learning_rate': 4.785047633178864e-06} {'loss': 0.7231, 'grad_norm': 7.084643840789795, 'learning_rate': 4.785047633178864e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 1.0421, 'grad_norm': 1.8917961120605469, 'learning_rate': 4.779230358822723e-06} [Rank 2] Trainer log: {'loss': 1.0421, 'grad_norm': 1.8917961120605469, 'learning_rate': 4.779230358822723e-06} [Rank 1] Trainer log: {'loss': 1.0421, 'grad_norm': 1.8917961120605469, 'learning_rate': 4.779230358822723e-06} [Rank 0] Trainer log: {'loss': 1.0421, 'grad_norm': 1.8917961120605469, 'learning_rate': 4.779230358822723e-06} {'loss': 1.0421, 'grad_norm': 1.8917961120605469, 'learning_rate': 4.779230358822723e-06, 'epoch': 0.69} [Rank 0] Trainer log: {'loss': 0.8082, 'grad_norm': 9.267499923706055, 'learning_rate': 4.773415512182331e-06}[Rank 3] Trainer log: {'loss': 0.8082, 'grad_norm': 9.267499923706055, 'learning_rate': 4.773415512182331e-06} [Rank 1] Trainer log: {'loss': 0.8082, 'grad_norm': 9.267499923706055, 'learning_rate': 4.773415512182331e-06} [Rank 2] Trainer log: {'loss': 0.8082, 'grad_norm': 9.267499923706055, 'learning_rate': 4.773415512182331e-06} {'loss': 0.8082, 'grad_norm': 9.267499923706055, 'learning_rate': 4.773415512182331e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.882, 'grad_norm': 3.560696601867676, 'learning_rate': 4.767603095961652e-06}[Rank 3] Trainer log: {'loss': 0.882, 'grad_norm': 3.560696601867676, 'learning_rate': 4.767603095961652e-06}[Rank 2] Trainer log: {'loss': 0.882, 'grad_norm': 3.560696601867676, 'learning_rate': 4.767603095961652e-06} [Rank 0] Trainer log: {'loss': 0.882, 'grad_norm': 3.560696601867676, 'learning_rate': 4.767603095961652e-06} {'loss': 0.882, 'grad_norm': 3.560696601867676, 'learning_rate': 4.767603095961652e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.882, 'grad_norm': 2.2951014041900635, 'learning_rate': 4.761793112863523e-06} [Rank 1] Trainer log: {'loss': 0.882, 'grad_norm': 2.2951014041900635, 'learning_rate': 4.761793112863523e-06}[Rank 0] Trainer log: {'loss': 0.882, 'grad_norm': 2.2951014041900635, 'learning_rate': 4.761793112863523e-06} [Rank 3] Trainer log: {'loss': 0.882, 'grad_norm': 2.2951014041900635, 'learning_rate': 4.761793112863523e-06} {'loss': 0.882, 'grad_norm': 2.2951014041900635, 'learning_rate': 4.761793112863523e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.8804, 'grad_norm': 7.7773237228393555, 'learning_rate': 4.755985565589655e-06}[Rank 3] Trainer log: {'loss': 0.8804, 'grad_norm': 7.7773237228393555, 'learning_rate': 4.755985565589655e-06}[Rank 1] Trainer log: {'loss': 0.8804, 'grad_norm': 7.7773237228393555, 'learning_rate': 4.755985565589655e-06} [Rank 0] Trainer log: {'loss': 0.8804, 'grad_norm': 7.7773237228393555, 'learning_rate': 4.755985565589655e-06} {'loss': 0.8804, 'grad_norm': 7.7773237228393555, 'learning_rate': 4.755985565589655e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.722, 'grad_norm': 4.892534255981445, 'learning_rate': 4.750180456840615e-06}[Rank 1] Trainer log: {'loss': 0.722, 'grad_norm': 4.892534255981445, 'learning_rate': 4.750180456840615e-06} [Rank 0] Trainer log: {'loss': 0.722, 'grad_norm': 4.892534255981445, 'learning_rate': 4.750180456840615e-06}[Rank 3] Trainer log: {'loss': 0.722, 'grad_norm': 4.892534255981445, 'learning_rate': 4.750180456840615e-06} {'loss': 0.722, 'grad_norm': 4.892534255981445, 'learning_rate': 4.750180456840615e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.9281, 'grad_norm': 3.304466962814331, 'learning_rate': 4.744377789315848e-06} [Rank 3] Trainer log: {'loss': 0.9281, 'grad_norm': 3.304466962814331, 'learning_rate': 4.744377789315848e-06} [Rank 0] Trainer log: {'loss': 0.9281, 'grad_norm': 3.304466962814331, 'learning_rate': 4.744377789315848e-06}[Rank 1] Trainer log: {'loss': 0.9281, 'grad_norm': 3.304466962814331, 'learning_rate': 4.744377789315848e-06} {'loss': 0.9281, 'grad_norm': 3.304466962814331, 'learning_rate': 4.744377789315848e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.9318, 'grad_norm': 3.6370043754577637, 'learning_rate': 4.738577565713661e-06} [Rank 3] Trainer log: {'loss': 0.9318, 'grad_norm': 3.6370043754577637, 'learning_rate': 4.738577565713661e-06} [Rank 1] Trainer log: {'loss': 0.9318, 'grad_norm': 3.6370043754577637, 'learning_rate': 4.738577565713661e-06} [Rank 0] Trainer log: {'loss': 0.9318, 'grad_norm': 3.6370043754577637, 'learning_rate': 4.738577565713661e-06} {'loss': 0.9318, 'grad_norm': 3.6370043754577637, 'learning_rate': 4.738577565713661e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8731, 'grad_norm': 2.5680787563323975, 'learning_rate': 4.732779788731219e-06}[Rank 2] Trainer log: {'loss': 0.8731, 'grad_norm': 2.5680787563323975, 'learning_rate': 4.732779788731219e-06}[Rank 1] Trainer log: {'loss': 0.8731, 'grad_norm': 2.5680787563323975, 'learning_rate': 4.732779788731219e-06} [Rank 0] Trainer log: {'loss': 0.8731, 'grad_norm': 2.5680787563323975, 'learning_rate': 4.732779788731219e-06} {'loss': 0.8731, 'grad_norm': 2.5680787563323975, 'learning_rate': 4.732779788731219e-06, 'epoch': 0.69} [Rank 1] Trainer log: {'loss': 0.9813, 'grad_norm': 3.4798543453216553, 'learning_rate': 4.726984461064549e-06}[Rank 2] Trainer log: {'loss': 0.9813, 'grad_norm': 3.4798543453216553, 'learning_rate': 4.726984461064549e-06} [Rank 0] Trainer log: {'loss': 0.9813, 'grad_norm': 3.4798543453216553, 'learning_rate': 4.726984461064549e-06}[Rank 3] Trainer log: {'loss': 0.9813, 'grad_norm': 3.4798543453216553, 'learning_rate': 4.726984461064549e-06} {'loss': 0.9813, 'grad_norm': 3.4798543453216553, 'learning_rate': 4.726984461064549e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.7653, 'grad_norm': 2.941128969192505, 'learning_rate': 4.72119158540855e-06} [Rank 0] Trainer log: {'loss': 0.7653, 'grad_norm': 2.941128969192505, 'learning_rate': 4.72119158540855e-06}[Rank 1] Trainer log: {'loss': 0.7653, 'grad_norm': 2.941128969192505, 'learning_rate': 4.72119158540855e-06}[Rank 3] Trainer log: {'loss': 0.7653, 'grad_norm': 2.941128969192505, 'learning_rate': 4.72119158540855e-06} {'loss': 0.7653, 'grad_norm': 2.941128969192505, 'learning_rate': 4.72119158540855e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.534, 'grad_norm': 3.869231939315796, 'learning_rate': 4.715401164456969e-06}[Rank 3] Trainer log: {'loss': 0.534, 'grad_norm': 3.869231939315796, 'learning_rate': 4.715401164456969e-06} [Rank 1] Trainer log: {'loss': 0.534, 'grad_norm': 3.869231939315796, 'learning_rate': 4.715401164456969e-06} [Rank 0] Trainer log: {'loss': 0.534, 'grad_norm': 3.869231939315796, 'learning_rate': 4.715401164456969e-06} {'loss': 0.534, 'grad_norm': 3.869231939315796, 'learning_rate': 4.715401164456969e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.7882, 'grad_norm': 10.843362808227539, 'learning_rate': 4.709613200902413e-06} [Rank 3] Trainer log: {'loss': 0.7882, 'grad_norm': 10.843362808227539, 'learning_rate': 4.709613200902413e-06}[Rank 0] Trainer log: {'loss': 0.7882, 'grad_norm': 10.843362808227539, 'learning_rate': 4.709613200902413e-06} [Rank 1] Trainer log: {'loss': 0.7882, 'grad_norm': 10.843362808227539, 'learning_rate': 4.709613200902413e-06} {'loss': 0.7882, 'grad_norm': 10.843362808227539, 'learning_rate': 4.709613200902413e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.8971, 'grad_norm': 2.238480806350708, 'learning_rate': 4.703827697436357e-06} [Rank 1] Trainer log: {'loss': 0.8971, 'grad_norm': 2.238480806350708, 'learning_rate': 4.703827697436357e-06} [Rank 0] Trainer log: {'loss': 0.8971, 'grad_norm': 2.238480806350708, 'learning_rate': 4.703827697436357e-06} [Rank 3] Trainer log: {'loss': 0.8971, 'grad_norm': 2.238480806350708, 'learning_rate': 4.703827697436357e-06} {'loss': 0.8971, 'grad_norm': 2.238480806350708, 'learning_rate': 4.703827697436357e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.7461, 'grad_norm': 5.3840651512146, 'learning_rate': 4.698044656749116e-06}[Rank 2] Trainer log: {'loss': 0.7461, 'grad_norm': 5.3840651512146, 'learning_rate': 4.698044656749116e-06}[Rank 1] Trainer log: {'loss': 0.7461, 'grad_norm': 5.3840651512146, 'learning_rate': 4.698044656749116e-06} [Rank 0] Trainer log: {'loss': 0.7461, 'grad_norm': 5.3840651512146, 'learning_rate': 4.698044656749116e-06} {'loss': 0.7461, 'grad_norm': 5.3840651512146, 'learning_rate': 4.698044656749116e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8379, 'grad_norm': 5.476072788238525, 'learning_rate': 4.692264081529871e-06}[Rank 1] Trainer log: {'loss': 0.8379, 'grad_norm': 5.476072788238525, 'learning_rate': 4.692264081529871e-06} [Rank 2] Trainer log: {'loss': 0.8379, 'grad_norm': 5.476072788238525, 'learning_rate': 4.692264081529871e-06} [Rank 0] Trainer log: {'loss': 0.8379, 'grad_norm': 5.476072788238525, 'learning_rate': 4.692264081529871e-06} {'loss': 0.8379, 'grad_norm': 5.476072788238525, 'learning_rate': 4.692264081529871e-06, 'epoch': 0.69} [Rank 2] Trainer log: {'loss': 0.9409, 'grad_norm': 2.4060685634613037, 'learning_rate': 4.686485974466659e-06} [Rank 0] Trainer log: {'loss': 0.9409, 'grad_norm': 2.4060685634613037, 'learning_rate': 4.686485974466659e-06}[Rank 1] Trainer log: {'loss': 0.9409, 'grad_norm': 2.4060685634613037, 'learning_rate': 4.686485974466659e-06} [Rank 3] Trainer log: {'loss': 0.9409, 'grad_norm': 2.4060685634613037, 'learning_rate': 4.686485974466659e-06} {'loss': 0.9409, 'grad_norm': 2.4060685634613037, 'learning_rate': 4.686485974466659e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 1.004, 'grad_norm': 5.010932922363281, 'learning_rate': 4.68071033824636e-06}[Rank 2] Trainer log: {'loss': 1.004, 'grad_norm': 5.010932922363281, 'learning_rate': 4.68071033824636e-06} [Rank 0] Trainer log: {'loss': 1.004, 'grad_norm': 5.010932922363281, 'learning_rate': 4.68071033824636e-06}[Rank 1] Trainer log: {'loss': 1.004, 'grad_norm': 5.010932922363281, 'learning_rate': 4.68071033824636e-06} {'loss': 1.004, 'grad_norm': 5.010932922363281, 'learning_rate': 4.68071033824636e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 0.8689, 'grad_norm': 2.8636810779571533, 'learning_rate': 4.674937175554704e-06}[Rank 2] Trainer log: {'loss': 0.8689, 'grad_norm': 2.8636810779571533, 'learning_rate': 4.674937175554704e-06} [Rank 1] Trainer log: {'loss': 0.8689, 'grad_norm': 2.8636810779571533, 'learning_rate': 4.674937175554704e-06} [Rank 0] Trainer log: {'loss': 0.8689, 'grad_norm': 2.8636810779571533, 'learning_rate': 4.674937175554704e-06} {'loss': 0.8689, 'grad_norm': 2.8636810779571533, 'learning_rate': 4.674937175554704e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 1.0892, 'grad_norm': 1.7842882871627808, 'learning_rate': 4.669166489076283e-06} [Rank 1] Trainer log: {'loss': 1.0892, 'grad_norm': 1.7842882871627808, 'learning_rate': 4.669166489076283e-06} [Rank 2] Trainer log: {'loss': 1.0892, 'grad_norm': 1.7842882871627808, 'learning_rate': 4.669166489076283e-06} [Rank 0] Trainer log: {'loss': 1.0892, 'grad_norm': 1.7842882871627808, 'learning_rate': 4.669166489076283e-06} {'loss': 1.0892, 'grad_norm': 1.7842882871627808, 'learning_rate': 4.669166489076283e-06, 'epoch': 0.69} [Rank 3] Trainer log: {'loss': 1.0255, 'grad_norm': 6.621767997741699, 'learning_rate': 4.663398281494528e-06}[Rank 2] Trainer log: {'loss': 1.0255, 'grad_norm': 6.621767997741699, 'learning_rate': 4.663398281494528e-06}[Rank 1] Trainer log: {'loss': 1.0255, 'grad_norm': 6.621767997741699, 'learning_rate': 4.663398281494528e-06} [Rank 0] Trainer log: {'loss': 1.0255, 'grad_norm': 6.621767997741699, 'learning_rate': 4.663398281494528e-06} {'loss': 1.0255, 'grad_norm': 6.621767997741699, 'learning_rate': 4.663398281494528e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.9868, 'grad_norm': 3.4172632694244385, 'learning_rate': 4.657632555491713e-06}[Rank 1] Trainer log: {'loss': 0.9868, 'grad_norm': 3.4172632694244385, 'learning_rate': 4.657632555491713e-06}[Rank 0] Trainer log: {'loss': 0.9868, 'grad_norm': 3.4172632694244385, 'learning_rate': 4.657632555491713e-06} [Rank 3] Trainer log: {'loss': 0.9868, 'grad_norm': 3.4172632694244385, 'learning_rate': 4.657632555491713e-06} {'loss': 0.9868, 'grad_norm': 3.4172632694244385, 'learning_rate': 4.657632555491713e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.7724, 'grad_norm': 2.130918502807617, 'learning_rate': 4.651869313748971e-06}[Rank 2] Trainer log: {'loss': 0.7724, 'grad_norm': 2.130918502807617, 'learning_rate': 4.651869313748971e-06} [Rank 0] Trainer log: {'loss': 0.7724, 'grad_norm': 2.130918502807617, 'learning_rate': 4.651869313748971e-06}[Rank 3] Trainer log: {'loss': 0.7724, 'grad_norm': 2.130918502807617, 'learning_rate': 4.651869313748971e-06} {'loss': 0.7724, 'grad_norm': 2.130918502807617, 'learning_rate': 4.651869313748971e-06, 'epoch': 0.7} [Rank 0] Trainer log: {'loss': 0.774, 'grad_norm': 2.5997090339660645, 'learning_rate': 4.646108558946277e-06}[Rank 3] Trainer log: {'loss': 0.774, 'grad_norm': 2.5997090339660645, 'learning_rate': 4.646108558946277e-06} [Rank 1] Trainer log: {'loss': 0.774, 'grad_norm': 2.5997090339660645, 'learning_rate': 4.646108558946277e-06} [Rank 2] Trainer log: {'loss': 0.774, 'grad_norm': 2.5997090339660645, 'learning_rate': 4.646108558946277e-06} {'loss': 0.774, 'grad_norm': 2.5997090339660645, 'learning_rate': 4.646108558946277e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.7612, 'grad_norm': 2.1862356662750244, 'learning_rate': 4.6403502937624386e-06} [Rank 3] Trainer log: {'loss': 0.7612, 'grad_norm': 2.1862356662750244, 'learning_rate': 4.6403502937624386e-06}[Rank 0] Trainer log: {'loss': 0.7612, 'grad_norm': 2.1862356662750244, 'learning_rate': 4.6403502937624386e-06} [Rank 2] Trainer log: {'loss': 0.7612, 'grad_norm': 2.1862356662750244, 'learning_rate': 4.6403502937624386e-06} {'loss': 0.7612, 'grad_norm': 2.1862356662750244, 'learning_rate': 4.6403502937624386e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.8998, 'grad_norm': 5.279348373413086, 'learning_rate': 4.634594520875113e-06}[Rank 3] Trainer log: {'loss': 0.8998, 'grad_norm': 5.279348373413086, 'learning_rate': 4.634594520875113e-06}[Rank 2] Trainer log: {'loss': 0.8998, 'grad_norm': 5.279348373413086, 'learning_rate': 4.634594520875113e-06} [Rank 0] Trainer log: {'loss': 0.8998, 'grad_norm': 5.279348373413086, 'learning_rate': 4.634594520875113e-06} {'loss': 0.8998, 'grad_norm': 5.279348373413086, 'learning_rate': 4.634594520875113e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.8814, 'grad_norm': 8.738509178161621, 'learning_rate': 4.628841242960805e-06}[Rank 2] Trainer log: {'loss': 0.8814, 'grad_norm': 8.738509178161621, 'learning_rate': 4.628841242960805e-06} [Rank 3] Trainer log: {'loss': 0.8814, 'grad_norm': 8.738509178161621, 'learning_rate': 4.628841242960805e-06} [Rank 0] Trainer log: {'loss': 0.8814, 'grad_norm': 8.738509178161621, 'learning_rate': 4.628841242960805e-06} {'loss': 0.8814, 'grad_norm': 8.738509178161621, 'learning_rate': 4.628841242960805e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.8562, 'grad_norm': 6.62926721572876, 'learning_rate': 4.623090462694847e-06}[Rank 2] Trainer log: {'loss': 0.8562, 'grad_norm': 6.62926721572876, 'learning_rate': 4.623090462694847e-06} [Rank 0] Trainer log: {'loss': 0.8562, 'grad_norm': 6.62926721572876, 'learning_rate': 4.623090462694847e-06} [Rank 3] Trainer log: {'loss': 0.8562, 'grad_norm': 6.62926721572876, 'learning_rate': 4.623090462694847e-06} {'loss': 0.8562, 'grad_norm': 6.62926721572876, 'learning_rate': 4.623090462694847e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8947, 'grad_norm': 2.363060712814331, 'learning_rate': 4.6173421827514145e-06}[Rank 1] Trainer log: {'loss': 0.8947, 'grad_norm': 2.363060712814331, 'learning_rate': 4.6173421827514145e-06} [Rank 3] Trainer log: {'loss': 0.8947, 'grad_norm': 2.363060712814331, 'learning_rate': 4.6173421827514145e-06} [Rank 0] Trainer log: {'loss': 0.8947, 'grad_norm': 2.363060712814331, 'learning_rate': 4.6173421827514145e-06} {'loss': 0.8947, 'grad_norm': 2.363060712814331, 'learning_rate': 4.6173421827514145e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.9134, 'grad_norm': 3.6608030796051025, 'learning_rate': 4.611596405803526e-06} [Rank 0] Trainer log: {'loss': 0.9134, 'grad_norm': 3.6608030796051025, 'learning_rate': 4.611596405803526e-06}[Rank 3] Trainer log: {'loss': 0.9134, 'grad_norm': 3.6608030796051025, 'learning_rate': 4.611596405803526e-06} [Rank 1] Trainer log: {'loss': 0.9134, 'grad_norm': 3.6608030796051025, 'learning_rate': 4.611596405803526e-06} {'loss': 0.9134, 'grad_norm': 3.6608030796051025, 'learning_rate': 4.611596405803526e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 0.9185, 'grad_norm': 4.471099853515625, 'learning_rate': 4.6058531345230274e-06} [Rank 2] Trainer log: {'loss': 0.9185, 'grad_norm': 4.471099853515625, 'learning_rate': 4.6058531345230274e-06} [Rank 1] Trainer log: {'loss': 0.9185, 'grad_norm': 4.471099853515625, 'learning_rate': 4.6058531345230274e-06} [Rank 0] Trainer log: {'loss': 0.9185, 'grad_norm': 4.471099853515625, 'learning_rate': 4.6058531345230274e-06} {'loss': 0.9185, 'grad_norm': 4.471099853515625, 'learning_rate': 4.6058531345230274e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8962, 'grad_norm': 3.2466225624084473, 'learning_rate': 4.600112371580604e-06}[Rank 1] Trainer log: {'loss': 0.8962, 'grad_norm': 3.2466225624084473, 'learning_rate': 4.600112371580604e-06}[Rank 3] Trainer log: {'loss': 0.8962, 'grad_norm': 3.2466225624084473, 'learning_rate': 4.600112371580604e-06} [Rank 0] Trainer log: {'loss': 0.8962, 'grad_norm': 3.2466225624084473, 'learning_rate': 4.600112371580604e-06} {'loss': 0.8962, 'grad_norm': 3.2466225624084473, 'learning_rate': 4.600112371580604e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.5854, 'grad_norm': 3.3084473609924316, 'learning_rate': 4.59437411964578e-06}[Rank 3] Trainer log: {'loss': 0.5854, 'grad_norm': 3.3084473609924316, 'learning_rate': 4.59437411964578e-06} [Rank 1] Trainer log: {'loss': 0.5854, 'grad_norm': 3.3084473609924316, 'learning_rate': 4.59437411964578e-06} [Rank 0] Trainer log: {'loss': 0.5854, 'grad_norm': 3.3084473609924316, 'learning_rate': 4.59437411964578e-06} {'loss': 0.5854, 'grad_norm': 3.3084473609924316, 'learning_rate': 4.59437411964578e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 1.0264, 'grad_norm': 2.6975133419036865, 'learning_rate': 4.5886383813869016e-06}[Rank 3] Trainer log: {'loss': 1.0264, 'grad_norm': 2.6975133419036865, 'learning_rate': 4.5886383813869016e-06} [Rank 1] Trainer log: {'loss': 1.0264, 'grad_norm': 2.6975133419036865, 'learning_rate': 4.5886383813869016e-06} [Rank 0] Trainer log: {'loss': 1.0264, 'grad_norm': 2.6975133419036865, 'learning_rate': 4.5886383813869016e-06} {'loss': 1.0264, 'grad_norm': 2.6975133419036865, 'learning_rate': 4.5886383813869016e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.7336, 'grad_norm': 5.086080551147461, 'learning_rate': 4.582905159471147e-06}[Rank 3] Trainer log: {'loss': 0.7336, 'grad_norm': 5.086080551147461, 'learning_rate': 4.582905159471147e-06} [Rank 1] Trainer log: {'loss': 0.7336, 'grad_norm': 5.086080551147461, 'learning_rate': 4.582905159471147e-06} [Rank 0] Trainer log: {'loss': 0.7336, 'grad_norm': 5.086080551147461, 'learning_rate': 4.582905159471147e-06} {'loss': 0.7336, 'grad_norm': 5.086080551147461, 'learning_rate': 4.582905159471147e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 1.021, 'grad_norm': 1.9695489406585693, 'learning_rate': 4.577174456564535e-06} [Rank 2] Trainer log: {'loss': 1.021, 'grad_norm': 1.9695489406585693, 'learning_rate': 4.577174456564535e-06} [Rank 0] Trainer log: {'loss': 1.021, 'grad_norm': 1.9695489406585693, 'learning_rate': 4.577174456564535e-06} {'loss': 1.021, 'grad_norm': 1.9695489406585693, 'learning_rate': 4.577174456564535e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 1.021, 'grad_norm': 1.9695489406585693, 'learning_rate': 4.577174456564535e-06} [Rank 2] Trainer log: {'loss': 0.7891, 'grad_norm': 2.4865736961364746, 'learning_rate': 4.571446275331903e-06}[Rank 1] Trainer log: {'loss': 0.7891, 'grad_norm': 2.4865736961364746, 'learning_rate': 4.571446275331903e-06}[Rank 3] Trainer log: {'loss': 0.7891, 'grad_norm': 2.4865736961364746, 'learning_rate': 4.571446275331903e-06} [Rank 0] Trainer log: {'loss': 0.7891, 'grad_norm': 2.4865736961364746, 'learning_rate': 4.571446275331903e-06} {'loss': 0.7891, 'grad_norm': 2.4865736961364746, 'learning_rate': 4.571446275331903e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.4516, 'grad_norm': 8.76771354675293, 'learning_rate': 4.565720618436915e-06}[Rank 1] Trainer log: {'loss': 0.4516, 'grad_norm': 8.76771354675293, 'learning_rate': 4.565720618436915e-06} [Rank 3] Trainer log: {'loss': 0.4516, 'grad_norm': 8.76771354675293, 'learning_rate': 4.565720618436915e-06} [Rank 0] Trainer log: {'loss': 0.4516, 'grad_norm': 8.76771354675293, 'learning_rate': 4.565720618436915e-06} {'loss': 0.4516, 'grad_norm': 8.76771354675293, 'learning_rate': 4.565720618436915e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.924, 'grad_norm': 3.3695085048675537, 'learning_rate': 4.559997488542072e-06} [Rank 3] Trainer log: {'loss': 0.924, 'grad_norm': 3.3695085048675537, 'learning_rate': 4.559997488542072e-06}[Rank 1] Trainer log: {'loss': 0.924, 'grad_norm': 3.3695085048675537, 'learning_rate': 4.559997488542072e-06} [Rank 0] Trainer log: {'loss': 0.924, 'grad_norm': 3.3695085048675537, 'learning_rate': 4.559997488542072e-06} {'loss': 0.924, 'grad_norm': 3.3695085048675537, 'learning_rate': 4.559997488542072e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.4935, 'grad_norm': 1.9949344396591187, 'learning_rate': 4.554276888308684e-06}[Rank 3] Trainer log: {'loss': 0.4935, 'grad_norm': 1.9949344396591187, 'learning_rate': 4.554276888308684e-06} [Rank 2] Trainer log: {'loss': 0.4935, 'grad_norm': 1.9949344396591187, 'learning_rate': 4.554276888308684e-06} [Rank 0] Trainer log: {'loss': 0.4935, 'grad_norm': 1.9949344396591187, 'learning_rate': 4.554276888308684e-06} {'loss': 0.4935, 'grad_norm': 1.9949344396591187, 'learning_rate': 4.554276888308684e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.6761, 'grad_norm': 5.3305439949035645, 'learning_rate': 4.5485588203969e-06} [Rank 0] Trainer log: {'loss': 0.6761, 'grad_norm': 5.3305439949035645, 'learning_rate': 4.5485588203969e-06}[Rank 3] Trainer log: {'loss': 0.6761, 'grad_norm': 5.3305439949035645, 'learning_rate': 4.5485588203969e-06} [Rank 1] Trainer log: {'loss': 0.6761, 'grad_norm': 5.3305439949035645, 'learning_rate': 4.5485588203969e-06} {'loss': 0.6761, 'grad_norm': 5.3305439949035645, 'learning_rate': 4.5485588203969e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.7364, 'grad_norm': 10.820547103881836, 'learning_rate': 4.542843287465687e-06}[Rank 2] Trainer log: {'loss': 0.7364, 'grad_norm': 10.820547103881836, 'learning_rate': 4.542843287465687e-06}[Rank 0] Trainer log: {'loss': 0.7364, 'grad_norm': 10.820547103881836, 'learning_rate': 4.542843287465687e-06} [Rank 3] Trainer log: {'loss': 0.7364, 'grad_norm': 10.820547103881836, 'learning_rate': 4.542843287465687e-06} {'loss': 0.7364, 'grad_norm': 10.820547103881836, 'learning_rate': 4.542843287465687e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.9558, 'grad_norm': 10.668102264404297, 'learning_rate': 4.537130292172828e-06}[Rank 1] Trainer log: {'loss': 0.9558, 'grad_norm': 10.668102264404297, 'learning_rate': 4.537130292172828e-06} [Rank 3] Trainer log: {'loss': 0.9558, 'grad_norm': 10.668102264404297, 'learning_rate': 4.537130292172828e-06} [Rank 0] Trainer log: {'loss': 0.9558, 'grad_norm': 10.668102264404297, 'learning_rate': 4.537130292172828e-06} {'loss': 0.9558, 'grad_norm': 10.668102264404297, 'learning_rate': 4.537130292172828e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 0.7316, 'grad_norm': 2.935777187347412, 'learning_rate': 4.531419837174925e-06} [Rank 2] Trainer log: {'loss': 0.7316, 'grad_norm': 2.935777187347412, 'learning_rate': 4.531419837174925e-06}[Rank 1] Trainer log: {'loss': 0.7316, 'grad_norm': 2.935777187347412, 'learning_rate': 4.531419837174925e-06} [Rank 0] Trainer log: {'loss': 0.7316, 'grad_norm': 2.935777187347412, 'learning_rate': 4.531419837174925e-06} {'loss': 0.7316, 'grad_norm': 2.935777187347412, 'learning_rate': 4.531419837174925e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 0.9133, 'grad_norm': 2.707061529159546, 'learning_rate': 4.52571192512741e-06} [Rank 0] Trainer log: {'loss': 0.9133, 'grad_norm': 2.707061529159546, 'learning_rate': 4.52571192512741e-06}[Rank 1] Trainer log: {'loss': 0.9133, 'grad_norm': 2.707061529159546, 'learning_rate': 4.52571192512741e-06} [Rank 2] Trainer log: {'loss': 0.9133, 'grad_norm': 2.707061529159546, 'learning_rate': 4.52571192512741e-06} {'loss': 0.9133, 'grad_norm': 2.707061529159546, 'learning_rate': 4.52571192512741e-06, 'epoch': 0.7} [Rank 0] Trainer log: {'loss': 0.768, 'grad_norm': 1.9760475158691406, 'learning_rate': 4.520006558684524e-06}[Rank 3] Trainer log: {'loss': 0.768, 'grad_norm': 1.9760475158691406, 'learning_rate': 4.520006558684524e-06}[Rank 2] Trainer log: {'loss': 0.768, 'grad_norm': 1.9760475158691406, 'learning_rate': 4.520006558684524e-06} [Rank 1] Trainer log: {'loss': 0.768, 'grad_norm': 1.9760475158691406, 'learning_rate': 4.520006558684524e-06} {'loss': 0.768, 'grad_norm': 1.9760475158691406, 'learning_rate': 4.520006558684524e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.7441, 'grad_norm': 3.354233503341675, 'learning_rate': 4.514303740499321e-06}[Rank 0] Trainer log: {'loss': 0.7441, 'grad_norm': 3.354233503341675, 'learning_rate': 4.514303740499321e-06}[Rank 3] Trainer log: {'loss': 0.7441, 'grad_norm': 3.354233503341675, 'learning_rate': 4.514303740499321e-06} [Rank 1] Trainer log: {'loss': 0.7441, 'grad_norm': 3.354233503341675, 'learning_rate': 4.514303740499321e-06} {'loss': 0.7441, 'grad_norm': 3.354233503341675, 'learning_rate': 4.514303740499321e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.988, 'grad_norm': 3.5123400688171387, 'learning_rate': 4.508603473223676e-06}[Rank 2] Trainer log: {'loss': 0.988, 'grad_norm': 3.5123400688171387, 'learning_rate': 4.508603473223676e-06} [Rank 3] Trainer log: {'loss': 0.988, 'grad_norm': 3.5123400688171387, 'learning_rate': 4.508603473223676e-06} [Rank 0] Trainer log: {'loss': 0.988, 'grad_norm': 3.5123400688171387, 'learning_rate': 4.508603473223676e-06} {'loss': 0.988, 'grad_norm': 3.5123400688171387, 'learning_rate': 4.508603473223676e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.9513, 'grad_norm': 7.357664585113525, 'learning_rate': 4.502905759508278e-06}[Rank 1] Trainer log: {'loss': 0.9513, 'grad_norm': 7.357664585113525, 'learning_rate': 4.502905759508278e-06} [Rank 0] Trainer log: {'loss': 0.9513, 'grad_norm': 7.357664585113525, 'learning_rate': 4.502905759508278e-06} [Rank 3] Trainer log: {'loss': 0.9513, 'grad_norm': 7.357664585113525, 'learning_rate': 4.502905759508278e-06} {'loss': 0.9513, 'grad_norm': 7.357664585113525, 'learning_rate': 4.502905759508278e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 1.0288, 'grad_norm': 2.138368844985962, 'learning_rate': 4.497210602002629e-06}[Rank 3] Trainer log: {'loss': 1.0288, 'grad_norm': 2.138368844985962, 'learning_rate': 4.497210602002629e-06} [Rank 0] Trainer log: {'loss': 1.0288, 'grad_norm': 2.138368844985962, 'learning_rate': 4.497210602002629e-06}[Rank 2] Trainer log: {'loss': 1.0288, 'grad_norm': 2.138368844985962, 'learning_rate': 4.497210602002629e-06} {'loss': 1.0288, 'grad_norm': 2.138368844985962, 'learning_rate': 4.497210602002629e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8043, 'grad_norm': 3.749131917953491, 'learning_rate': 4.491518003355035e-06} [Rank 3] Trainer log: {'loss': 0.8043, 'grad_norm': 3.749131917953491, 'learning_rate': 4.491518003355035e-06} [Rank 0] Trainer log: {'loss': 0.8043, 'grad_norm': 3.749131917953491, 'learning_rate': 4.491518003355035e-06}[Rank 1] Trainer log: {'loss': 0.8043, 'grad_norm': 3.749131917953491, 'learning_rate': 4.491518003355035e-06} {'loss': 0.8043, 'grad_norm': 3.749131917953491, 'learning_rate': 4.491518003355035e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8469, 'grad_norm': 5.992094993591309, 'learning_rate': 4.485827966212626e-06}[Rank 1] Trainer log: {'loss': 0.8469, 'grad_norm': 5.992094993591309, 'learning_rate': 4.485827966212626e-06}[Rank 0] Trainer log: {'loss': 0.8469, 'grad_norm': 5.992094993591309, 'learning_rate': 4.485827966212626e-06} [Rank 3] Trainer log: {'loss': 0.8469, 'grad_norm': 5.992094993591309, 'learning_rate': 4.485827966212626e-06} {'loss': 0.8469, 'grad_norm': 5.992094993591309, 'learning_rate': 4.485827966212626e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 0.935, 'grad_norm': 5.382296562194824, 'learning_rate': 4.480140493221325e-06} [Rank 2] Trainer log: {'loss': 0.935, 'grad_norm': 5.382296562194824, 'learning_rate': 4.480140493221325e-06} [Rank 0] Trainer log: {'loss': 0.935, 'grad_norm': 5.382296562194824, 'learning_rate': 4.480140493221325e-06}[Rank 1] Trainer log: {'loss': 0.935, 'grad_norm': 5.382296562194824, 'learning_rate': 4.480140493221325e-06} {'loss': 0.935, 'grad_norm': 5.382296562194824, 'learning_rate': 4.480140493221325e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.9116, 'grad_norm': 10.943899154663086, 'learning_rate': 4.47445558702587e-06} [Rank 1] Trainer log: {'loss': 0.9116, 'grad_norm': 10.943899154663086, 'learning_rate': 4.47445558702587e-06} [Rank 0] Trainer log: {'loss': 0.9116, 'grad_norm': 10.943899154663086, 'learning_rate': 4.47445558702587e-06}[Rank 3] Trainer log: {'loss': 0.9116, 'grad_norm': 10.943899154663086, 'learning_rate': 4.47445558702587e-06} {'loss': 0.9116, 'grad_norm': 10.943899154663086, 'learning_rate': 4.47445558702587e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.7625, 'grad_norm': 3.8745052814483643, 'learning_rate': 4.46877325026981e-06}[Rank 0] Trainer log: {'loss': 0.7625, 'grad_norm': 3.8745052814483643, 'learning_rate': 4.46877325026981e-06} [Rank 3] Trainer log: {'loss': 0.7625, 'grad_norm': 3.8745052814483643, 'learning_rate': 4.46877325026981e-06}[Rank 2] Trainer log: {'loss': 0.7625, 'grad_norm': 3.8745052814483643, 'learning_rate': 4.46877325026981e-06} {'loss': 0.7625, 'grad_norm': 3.8745052814483643, 'learning_rate': 4.46877325026981e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.6959, 'grad_norm': 3.0605251789093018, 'learning_rate': 4.463093485595492e-06}[Rank 1] Trainer log: {'loss': 0.6959, 'grad_norm': 3.0605251789093018, 'learning_rate': 4.463093485595492e-06}[Rank 3] Trainer log: {'loss': 0.6959, 'grad_norm': 3.0605251789093018, 'learning_rate': 4.463093485595492e-06} [Rank 0] Trainer log: {'loss': 0.6959, 'grad_norm': 3.0605251789093018, 'learning_rate': 4.463093485595492e-06} {'loss': 0.6959, 'grad_norm': 3.0605251789093018, 'learning_rate': 4.463093485595492e-06, 'epoch': 0.7} [Rank 3] Trainer log: {'loss': 0.8313, 'grad_norm': 9.662293434143066, 'learning_rate': 4.457416295644067e-06}[Rank 0] Trainer log: {'loss': 0.8313, 'grad_norm': 9.662293434143066, 'learning_rate': 4.457416295644067e-06} [Rank 2] Trainer log: {'loss': 0.8313, 'grad_norm': 9.662293434143066, 'learning_rate': 4.457416295644067e-06}[Rank 1] Trainer log: {'loss': 0.8313, 'grad_norm': 9.662293434143066, 'learning_rate': 4.457416295644067e-06} {'loss': 0.8313, 'grad_norm': 9.662293434143066, 'learning_rate': 4.457416295644067e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 1.0735, 'grad_norm': 2.076378107070923, 'learning_rate': 4.451741683055492e-06} [Rank 2] Trainer log: {'loss': 1.0735, 'grad_norm': 2.076378107070923, 'learning_rate': 4.451741683055492e-06} [Rank 3] Trainer log: {'loss': 1.0735, 'grad_norm': 2.076378107070923, 'learning_rate': 4.451741683055492e-06} [Rank 0] Trainer log: {'loss': 1.0735, 'grad_norm': 2.076378107070923, 'learning_rate': 4.451741683055492e-06} {'loss': 1.0735, 'grad_norm': 2.076378107070923, 'learning_rate': 4.451741683055492e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 1.0284, 'grad_norm': 3.388908624649048, 'learning_rate': 4.4460696504685315e-06}[Rank 0] Trainer log: {'loss': 1.0284, 'grad_norm': 3.388908624649048, 'learning_rate': 4.4460696504685315e-06}[Rank 1] Trainer log: {'loss': 1.0284, 'grad_norm': 3.388908624649048, 'learning_rate': 4.4460696504685315e-06} [Rank 3] Trainer log: {'loss': 1.0284, 'grad_norm': 3.388908624649048, 'learning_rate': 4.4460696504685315e-06} {'loss': 1.0284, 'grad_norm': 3.388908624649048, 'learning_rate': 4.4460696504685315e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.7598, 'grad_norm': 5.332174777984619, 'learning_rate': 4.4404002005207344e-06}[Rank 3] Trainer log: {'loss': 0.7598, 'grad_norm': 5.332174777984619, 'learning_rate': 4.4404002005207344e-06}[Rank 0] Trainer log: {'loss': 0.7598, 'grad_norm': 5.332174777984619, 'learning_rate': 4.4404002005207344e-06} [Rank 1] Trainer log: {'loss': 0.7598, 'grad_norm': 5.332174777984619, 'learning_rate': 4.4404002005207344e-06} {'loss': 0.7598, 'grad_norm': 5.332174777984619, 'learning_rate': 4.4404002005207344e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.6606, 'grad_norm': 2.0152459144592285, 'learning_rate': 4.434733335848466e-06}[Rank 2] Trainer log: {'loss': 0.6606, 'grad_norm': 2.0152459144592285, 'learning_rate': 4.434733335848466e-06}[Rank 3] Trainer log: {'loss': 0.6606, 'grad_norm': 2.0152459144592285, 'learning_rate': 4.434733335848466e-06} [Rank 0] Trainer log: {'loss': 0.6606, 'grad_norm': 2.0152459144592285, 'learning_rate': 4.434733335848466e-06} {'loss': 0.6606, 'grad_norm': 2.0152459144592285, 'learning_rate': 4.434733335848466e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.5878, 'grad_norm': 12.835857391357422, 'learning_rate': 4.429069059086877e-06}[Rank 3] Trainer log: {'loss': 0.5878, 'grad_norm': 12.835857391357422, 'learning_rate': 4.429069059086877e-06}[Rank 1] Trainer log: {'loss': 0.5878, 'grad_norm': 12.835857391357422, 'learning_rate': 4.429069059086877e-06} [Rank 0] Trainer log: {'loss': 0.5878, 'grad_norm': 12.835857391357422, 'learning_rate': 4.429069059086877e-06} {'loss': 0.5878, 'grad_norm': 12.835857391357422, 'learning_rate': 4.429069059086877e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8597, 'grad_norm': 3.591714382171631, 'learning_rate': 4.423407372869915e-06} [Rank 0] Trainer log: {'loss': 0.8597, 'grad_norm': 3.591714382171631, 'learning_rate': 4.423407372869915e-06}[Rank 3] Trainer log: {'loss': 0.8597, 'grad_norm': 3.591714382171631, 'learning_rate': 4.423407372869915e-06} [Rank 1] Trainer log: {'loss': 0.8597, 'grad_norm': 3.591714382171631, 'learning_rate': 4.423407372869915e-06} {'loss': 0.8597, 'grad_norm': 3.591714382171631, 'learning_rate': 4.423407372869915e-06, 'epoch': 0.7} [Rank 0] Trainer log: {'loss': 0.7108, 'grad_norm': 4.875146865844727, 'learning_rate': 4.417748279830334e-06}[Rank 1] Trainer log: {'loss': 0.7108, 'grad_norm': 4.875146865844727, 'learning_rate': 4.417748279830334e-06} [Rank 2] Trainer log: {'loss': 0.7108, 'grad_norm': 4.875146865844727, 'learning_rate': 4.417748279830334e-06} [Rank 3] Trainer log: {'loss': 0.7108, 'grad_norm': 4.875146865844727, 'learning_rate': 4.417748279830334e-06} {'loss': 0.7108, 'grad_norm': 4.875146865844727, 'learning_rate': 4.417748279830334e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8793, 'grad_norm': 6.304074287414551, 'learning_rate': 4.412091782599672e-06}[Rank 1] Trainer log: {'loss': 0.8793, 'grad_norm': 6.304074287414551, 'learning_rate': 4.412091782599672e-06}[Rank 3] Trainer log: {'loss': 0.8793, 'grad_norm': 6.304074287414551, 'learning_rate': 4.412091782599672e-06}[Rank 0] Trainer log: {'loss': 0.8793, 'grad_norm': 6.304074287414551, 'learning_rate': 4.412091782599672e-06} {'loss': 0.8793, 'grad_norm': 6.304074287414551, 'learning_rate': 4.412091782599672e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.8302, 'grad_norm': 5.600428104400635, 'learning_rate': 4.40643788380826e-06}[Rank 0] Trainer log: {'loss': 0.8302, 'grad_norm': 5.600428104400635, 'learning_rate': 4.40643788380826e-06}[Rank 1] Trainer log: {'loss': 0.8302, 'grad_norm': 5.600428104400635, 'learning_rate': 4.40643788380826e-06} [Rank 3] Trainer log: {'loss': 0.8302, 'grad_norm': 5.600428104400635, 'learning_rate': 4.40643788380826e-06} {'loss': 0.8302, 'grad_norm': 5.600428104400635, 'learning_rate': 4.40643788380826e-06, 'epoch': 0.7} [Rank 1] Trainer log: {'loss': 0.6803, 'grad_norm': 17.221923828125, 'learning_rate': 4.4007865860852265e-06}[Rank 3] Trainer log: {'loss': 0.6803, 'grad_norm': 17.221923828125, 'learning_rate': 4.4007865860852265e-06} [Rank 0] Trainer log: {'loss': 0.6803, 'grad_norm': 17.221923828125, 'learning_rate': 4.4007865860852265e-06}[Rank 2] Trainer log: {'loss': 0.6803, 'grad_norm': 17.221923828125, 'learning_rate': 4.4007865860852265e-06} {'loss': 0.6803, 'grad_norm': 17.221923828125, 'learning_rate': 4.4007865860852265e-06, 'epoch': 0.7} [Rank 0] Trainer log: {'loss': 0.5219, 'grad_norm': 6.041540622711182, 'learning_rate': 4.39513789205849e-06}[Rank 1] Trainer log: {'loss': 0.5219, 'grad_norm': 6.041540622711182, 'learning_rate': 4.39513789205849e-06}[Rank 3] Trainer log: {'loss': 0.5219, 'grad_norm': 6.041540622711182, 'learning_rate': 4.39513789205849e-06} [Rank 2] Trainer log: {'loss': 0.5219, 'grad_norm': 6.041540622711182, 'learning_rate': 4.39513789205849e-06} {'loss': 0.5219, 'grad_norm': 6.041540622711182, 'learning_rate': 4.39513789205849e-06, 'epoch': 0.7} [Rank 2] Trainer log: {'loss': 0.4723, 'grad_norm': 18.969934463500977, 'learning_rate': 4.38949180435475e-06}[Rank 1] Trainer log: {'loss': 0.4723, 'grad_norm': 18.969934463500977, 'learning_rate': 4.38949180435475e-06} [Rank 0] Trainer log: {'loss': 0.4723, 'grad_norm': 18.969934463500977, 'learning_rate': 4.38949180435475e-06}[Rank 3] Trainer log: {'loss': 0.4723, 'grad_norm': 18.969934463500977, 'learning_rate': 4.38949180435475e-06} {'loss': 0.4723, 'grad_norm': 18.969934463500977, 'learning_rate': 4.38949180435475e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7503, 'grad_norm': 5.131930828094482, 'learning_rate': 4.3838483255995055e-06}[Rank 1] Trainer log: {'loss': 0.7503, 'grad_norm': 5.131930828094482, 'learning_rate': 4.3838483255995055e-06}[Rank 3] Trainer log: {'loss': 0.7503, 'grad_norm': 5.131930828094482, 'learning_rate': 4.3838483255995055e-06} [Rank 0] Trainer log: {'loss': 0.7503, 'grad_norm': 5.131930828094482, 'learning_rate': 4.3838483255995055e-06} {'loss': 0.7503, 'grad_norm': 5.131930828094482, 'learning_rate': 4.3838483255995055e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.7169, 'grad_norm': 11.759084701538086, 'learning_rate': 4.378207458417035e-06}[Rank 2] Trainer log: {'loss': 0.7169, 'grad_norm': 11.759084701538086, 'learning_rate': 4.378207458417035e-06}[Rank 1] Trainer log: {'loss': 0.7169, 'grad_norm': 11.759084701538086, 'learning_rate': 4.378207458417035e-06} [Rank 0] Trainer log: {'loss': 0.7169, 'grad_norm': 11.759084701538086, 'learning_rate': 4.378207458417035e-06} {'loss': 0.7169, 'grad_norm': 11.759084701538086, 'learning_rate': 4.378207458417035e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.9119, 'grad_norm': 4.288745880126953, 'learning_rate': 4.372569205430401e-06}[Rank 2] Trainer log: {'loss': 0.9119, 'grad_norm': 4.288745880126953, 'learning_rate': 4.372569205430401e-06} [Rank 3] Trainer log: {'loss': 0.9119, 'grad_norm': 4.288745880126953, 'learning_rate': 4.372569205430401e-06} [Rank 0] Trainer log: {'loss': 0.9119, 'grad_norm': 4.288745880126953, 'learning_rate': 4.372569205430401e-06} {'loss': 0.9119, 'grad_norm': 4.288745880126953, 'learning_rate': 4.372569205430401e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.8918, 'grad_norm': 9.084725379943848, 'learning_rate': 4.366933569261458e-06}[Rank 3] Trainer log: {'loss': 0.8918, 'grad_norm': 9.084725379943848, 'learning_rate': 4.366933569261458e-06}[Rank 2] Trainer log: {'loss': 0.8918, 'grad_norm': 9.084725379943848, 'learning_rate': 4.366933569261458e-06} [Rank 0] Trainer log: {'loss': 0.8918, 'grad_norm': 9.084725379943848, 'learning_rate': 4.366933569261458e-06} {'loss': 0.8918, 'grad_norm': 9.084725379943848, 'learning_rate': 4.366933569261458e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.842, 'grad_norm': 5.443936347961426, 'learning_rate': 4.3613005525308385e-06} [Rank 0] Trainer log: {'loss': 0.842, 'grad_norm': 5.443936347961426, 'learning_rate': 4.3613005525308385e-06}[Rank 1] Trainer log: {'loss': 0.842, 'grad_norm': 5.443936347961426, 'learning_rate': 4.3613005525308385e-06}[Rank 2] Trainer log: {'loss': 0.842, 'grad_norm': 5.443936347961426, 'learning_rate': 4.3613005525308385e-06} {'loss': 0.842, 'grad_norm': 5.443936347961426, 'learning_rate': 4.3613005525308385e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.7251, 'grad_norm': 3.509927749633789, 'learning_rate': 4.355670157857952e-06}[Rank 3] Trainer log: {'loss': 0.7251, 'grad_norm': 3.509927749633789, 'learning_rate': 4.355670157857952e-06}[Rank 2] Trainer log: {'loss': 0.7251, 'grad_norm': 3.509927749633789, 'learning_rate': 4.355670157857952e-06} [Rank 0] Trainer log: {'loss': 0.7251, 'grad_norm': 3.509927749633789, 'learning_rate': 4.355670157857952e-06} {'loss': 0.7251, 'grad_norm': 3.509927749633789, 'learning_rate': 4.355670157857952e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9801, 'grad_norm': 5.8687238693237305, 'learning_rate': 4.350042387861e-06}[Rank 3] Trainer log: {'loss': 0.9801, 'grad_norm': 5.8687238693237305, 'learning_rate': 4.350042387861e-06} [Rank 1] Trainer log: {'loss': 0.9801, 'grad_norm': 5.8687238693237305, 'learning_rate': 4.350042387861e-06} [Rank 0] Trainer log: {'loss': 0.9801, 'grad_norm': 5.8687238693237305, 'learning_rate': 4.350042387861e-06} {'loss': 0.9801, 'grad_norm': 5.8687238693237305, 'learning_rate': 4.350042387861e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 1.0874, 'grad_norm': 3.0018253326416016, 'learning_rate': 4.34441724515696e-06} [Rank 3] Trainer log: {'loss': 1.0874, 'grad_norm': 3.0018253326416016, 'learning_rate': 4.34441724515696e-06} [Rank 0] Trainer log: {'loss': 1.0874, 'grad_norm': 3.0018253326416016, 'learning_rate': 4.34441724515696e-06} [Rank 1] Trainer log: {'loss': 1.0874, 'grad_norm': 3.0018253326416016, 'learning_rate': 4.34441724515696e-06} {'loss': 1.0874, 'grad_norm': 3.0018253326416016, 'learning_rate': 4.34441724515696e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9321, 'grad_norm': 5.058714389801025, 'learning_rate': 4.338794732361582e-06} [Rank 1] Trainer log: {'loss': 0.9321, 'grad_norm': 5.058714389801025, 'learning_rate': 4.338794732361582e-06} [Rank 0] Trainer log: {'loss': 0.9321, 'grad_norm': 5.058714389801025, 'learning_rate': 4.338794732361582e-06}[Rank 3] Trainer log: {'loss': 0.9321, 'grad_norm': 5.058714389801025, 'learning_rate': 4.338794732361582e-06} {'loss': 0.9321, 'grad_norm': 5.058714389801025, 'learning_rate': 4.338794732361582e-06, 'epoch': 0.71} [Rank 0] Trainer log: {'loss': 0.8953, 'grad_norm': 4.004299163818359, 'learning_rate': 4.333174852089394e-06}[Rank 3] Trainer log: {'loss': 0.8953, 'grad_norm': 4.004299163818359, 'learning_rate': 4.333174852089394e-06}[Rank 2] Trainer log: {'loss': 0.8953, 'grad_norm': 4.004299163818359, 'learning_rate': 4.333174852089394e-06} [Rank 1] Trainer log: {'loss': 0.8953, 'grad_norm': 4.004299163818359, 'learning_rate': 4.333174852089394e-06} {'loss': 0.8953, 'grad_norm': 4.004299163818359, 'learning_rate': 4.333174852089394e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.7857, 'grad_norm': 6.520051002502441, 'learning_rate': 4.327557606953712e-06}[Rank 2] Trainer log: {'loss': 0.7857, 'grad_norm': 6.520051002502441, 'learning_rate': 4.327557606953712e-06} [Rank 1] Trainer log: {'loss': 0.7857, 'grad_norm': 6.520051002502441, 'learning_rate': 4.327557606953712e-06} [Rank 0] Trainer log: {'loss': 0.7857, 'grad_norm': 6.520051002502441, 'learning_rate': 4.327557606953712e-06} {'loss': 0.7857, 'grad_norm': 6.520051002502441, 'learning_rate': 4.327557606953712e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.531, 'grad_norm': 8.305449485778809, 'learning_rate': 4.321942999566608e-06}[Rank 3] Trainer log: {'loss': 0.531, 'grad_norm': 8.305449485778809, 'learning_rate': 4.321942999566608e-06} [Rank 0] Trainer log: {'loss': 0.531, 'grad_norm': 8.305449485778809, 'learning_rate': 4.321942999566608e-06} [Rank 1] Trainer log: {'loss': 0.531, 'grad_norm': 8.305449485778809, 'learning_rate': 4.321942999566608e-06} {'loss': 0.531, 'grad_norm': 8.305449485778809, 'learning_rate': 4.321942999566608e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.8224, 'grad_norm': 4.908524036407471, 'learning_rate': 4.316331032538939e-06} [Rank 0] Trainer log: {'loss': 0.8224, 'grad_norm': 4.908524036407471, 'learning_rate': 4.316331032538939e-06} [Rank 3] Trainer log: {'loss': 0.8224, 'grad_norm': 4.908524036407471, 'learning_rate': 4.316331032538939e-06} [Rank 1] Trainer log: {'loss': 0.8224, 'grad_norm': 4.908524036407471, 'learning_rate': 4.316331032538939e-06} {'loss': 0.8224, 'grad_norm': 4.908524036407471, 'learning_rate': 4.316331032538939e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7245, 'grad_norm': 2.746137857437134, 'learning_rate': 4.310721708480334e-06}[Rank 0] Trainer log: {'loss': 0.7245, 'grad_norm': 2.746137857437134, 'learning_rate': 4.310721708480334e-06}[Rank 1] Trainer log: {'loss': 0.7245, 'grad_norm': 2.746137857437134, 'learning_rate': 4.310721708480334e-06} [Rank 3] Trainer log: {'loss': 0.7245, 'grad_norm': 2.746137857437134, 'learning_rate': 4.310721708480334e-06} {'loss': 0.7245, 'grad_norm': 2.746137857437134, 'learning_rate': 4.310721708480334e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.646, 'grad_norm': 9.856644630432129, 'learning_rate': 4.305115029999187e-06}[Rank 0] Trainer log: {'loss': 0.646, 'grad_norm': 9.856644630432129, 'learning_rate': 4.305115029999187e-06}[Rank 3] Trainer log: {'loss': 0.646, 'grad_norm': 9.856644630432129, 'learning_rate': 4.305115029999187e-06} [Rank 1] Trainer log: {'loss': 0.646, 'grad_norm': 9.856644630432129, 'learning_rate': 4.305115029999187e-06} {'loss': 0.646, 'grad_norm': 9.856644630432129, 'learning_rate': 4.305115029999187e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.8179, 'grad_norm': 4.005688667297363, 'learning_rate': 4.29951099970267e-06}[Rank 2] Trainer log: {'loss': 0.8179, 'grad_norm': 4.005688667297363, 'learning_rate': 4.29951099970267e-06} [Rank 1] Trainer log: {'loss': 0.8179, 'grad_norm': 4.005688667297363, 'learning_rate': 4.29951099970267e-06} [Rank 0] Trainer log: {'loss': 0.8179, 'grad_norm': 4.005688667297363, 'learning_rate': 4.29951099970267e-06} {'loss': 0.8179, 'grad_norm': 4.005688667297363, 'learning_rate': 4.29951099970267e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.6597, 'grad_norm': 10.505650520324707, 'learning_rate': 4.2939096201967186e-06}[Rank 0] Trainer log: {'loss': 0.6597, 'grad_norm': 10.505650520324707, 'learning_rate': 4.2939096201967186e-06}[Rank 2] Trainer log: {'loss': 0.6597, 'grad_norm': 10.505650520324707, 'learning_rate': 4.2939096201967186e-06} [Rank 3] Trainer log: {'loss': 0.6597, 'grad_norm': 10.505650520324707, 'learning_rate': 4.2939096201967186e-06} {'loss': 0.6597, 'grad_norm': 10.505650520324707, 'learning_rate': 4.2939096201967186e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7809, 'grad_norm': 2.701481342315674, 'learning_rate': 4.2883108940860365e-06} [Rank 3] Trainer log: {'loss': 0.7809, 'grad_norm': 2.701481342315674, 'learning_rate': 4.2883108940860365e-06}[Rank 1] Trainer log: {'loss': 0.7809, 'grad_norm': 2.701481342315674, 'learning_rate': 4.2883108940860365e-06} [Rank 0] Trainer log: {'loss': 0.7809, 'grad_norm': 2.701481342315674, 'learning_rate': 4.2883108940860365e-06} {'loss': 0.7809, 'grad_norm': 2.701481342315674, 'learning_rate': 4.2883108940860365e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9932, 'grad_norm': 5.332484722137451, 'learning_rate': 4.282714823974088e-06}[Rank 3] Trainer log: {'loss': 0.9932, 'grad_norm': 5.332484722137451, 'learning_rate': 4.282714823974088e-06} [Rank 0] Trainer log: {'loss': 0.9932, 'grad_norm': 5.332484722137451, 'learning_rate': 4.282714823974088e-06} [Rank 1] Trainer log: {'loss': 0.9932, 'grad_norm': 5.332484722137451, 'learning_rate': 4.282714823974088e-06} {'loss': 0.9932, 'grad_norm': 5.332484722137451, 'learning_rate': 4.282714823974088e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.6095, 'grad_norm': 1.8313478231430054, 'learning_rate': 4.2771214124631155e-06} [Rank 3] Trainer log: {'loss': 0.6095, 'grad_norm': 1.8313478231430054, 'learning_rate': 4.2771214124631155e-06} [Rank 0] Trainer log: {'loss': 0.6095, 'grad_norm': 1.8313478231430054, 'learning_rate': 4.2771214124631155e-06}[Rank 1] Trainer log: {'loss': 0.6095, 'grad_norm': 1.8313478231430054, 'learning_rate': 4.2771214124631155e-06} {'loss': 0.6095, 'grad_norm': 1.8313478231430054, 'learning_rate': 4.2771214124631155e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7147, 'grad_norm': 7.970254421234131, 'learning_rate': 4.271530662154114e-06}[Rank 0] Trainer log: {'loss': 0.7147, 'grad_norm': 7.970254421234131, 'learning_rate': 4.271530662154114e-06}[Rank 3] Trainer log: {'loss': 0.7147, 'grad_norm': 7.970254421234131, 'learning_rate': 4.271530662154114e-06} [Rank 1] Trainer log: {'loss': 0.7147, 'grad_norm': 7.970254421234131, 'learning_rate': 4.271530662154114e-06} {'loss': 0.7147, 'grad_norm': 7.970254421234131, 'learning_rate': 4.271530662154114e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.6628, 'grad_norm': 5.874927520751953, 'learning_rate': 4.265942575646841e-06}[Rank 2] Trainer log: {'loss': 0.6628, 'grad_norm': 5.874927520751953, 'learning_rate': 4.265942575646841e-06}[Rank 0] Trainer log: {'loss': 0.6628, 'grad_norm': 5.874927520751953, 'learning_rate': 4.265942575646841e-06} [Rank 3] Trainer log: {'loss': 0.6628, 'grad_norm': 5.874927520751953, 'learning_rate': 4.265942575646841e-06} {'loss': 0.6628, 'grad_norm': 5.874927520751953, 'learning_rate': 4.265942575646841e-06, 'epoch': 0.71} [Rank 0] Trainer log: {'loss': 0.9094, 'grad_norm': 3.347580671310425, 'learning_rate': 4.2603571555398205e-06}[Rank 2] Trainer log: {'loss': 0.9094, 'grad_norm': 3.347580671310425, 'learning_rate': 4.2603571555398205e-06} [Rank 3] Trainer log: {'loss': 0.9094, 'grad_norm': 3.347580671310425, 'learning_rate': 4.2603571555398205e-06} [Rank 1] Trainer log: {'loss': 0.9094, 'grad_norm': 3.347580671310425, 'learning_rate': 4.2603571555398205e-06} {'loss': 0.9094, 'grad_norm': 3.347580671310425, 'learning_rate': 4.2603571555398205e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.7375, 'grad_norm': 3.3814308643341064, 'learning_rate': 4.25477440443034e-06}[Rank 1] Trainer log: {'loss': 0.7375, 'grad_norm': 3.3814308643341064, 'learning_rate': 4.25477440443034e-06}[Rank 2] Trainer log: {'loss': 0.7375, 'grad_norm': 3.3814308643341064, 'learning_rate': 4.25477440443034e-06} [Rank 0] Trainer log: {'loss': 0.7375, 'grad_norm': 3.3814308643341064, 'learning_rate': 4.25477440443034e-06} {'loss': 0.7375, 'grad_norm': 3.3814308643341064, 'learning_rate': 4.25477440443034e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7251, 'grad_norm': 2.0841076374053955, 'learning_rate': 4.249194324914433e-06} [Rank 3] Trainer log: {'loss': 0.7251, 'grad_norm': 2.0841076374053955, 'learning_rate': 4.249194324914433e-06}[Rank 1] Trainer log: {'loss': 0.7251, 'grad_norm': 2.0841076374053955, 'learning_rate': 4.249194324914433e-06} [Rank 0] Trainer log: {'loss': 0.7251, 'grad_norm': 2.0841076374053955, 'learning_rate': 4.249194324914433e-06} {'loss': 0.7251, 'grad_norm': 2.0841076374053955, 'learning_rate': 4.249194324914433e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.8554, 'grad_norm': 2.978841781616211, 'learning_rate': 4.243616919586906e-06}[Rank 3] Trainer log: {'loss': 0.8554, 'grad_norm': 2.978841781616211, 'learning_rate': 4.243616919586906e-06} [Rank 0] Trainer log: {'loss': 0.8554, 'grad_norm': 2.978841781616211, 'learning_rate': 4.243616919586906e-06}[Rank 1] Trainer log: {'loss': 0.8554, 'grad_norm': 2.978841781616211, 'learning_rate': 4.243616919586906e-06} {'loss': 0.8554, 'grad_norm': 2.978841781616211, 'learning_rate': 4.243616919586906e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 1.0772, 'grad_norm': 2.023512363433838, 'learning_rate': 4.238042191041309e-06}[Rank 2] Trainer log: {'loss': 1.0772, 'grad_norm': 2.023512363433838, 'learning_rate': 4.238042191041309e-06} [Rank 0] Trainer log: {'loss': 1.0772, 'grad_norm': 2.023512363433838, 'learning_rate': 4.238042191041309e-06}[Rank 1] Trainer log: {'loss': 1.0772, 'grad_norm': 2.023512363433838, 'learning_rate': 4.238042191041309e-06} {'loss': 1.0772, 'grad_norm': 2.023512363433838, 'learning_rate': 4.238042191041309e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9155, 'grad_norm': 2.8516080379486084, 'learning_rate': 4.232470141869952e-06} [Rank 3] Trainer log: {'loss': 0.9155, 'grad_norm': 2.8516080379486084, 'learning_rate': 4.232470141869952e-06} [Rank 0] Trainer log: {'loss': 0.9155, 'grad_norm': 2.8516080379486084, 'learning_rate': 4.232470141869952e-06}[Rank 1] Trainer log: {'loss': 0.9155, 'grad_norm': 2.8516080379486084, 'learning_rate': 4.232470141869952e-06} {'loss': 0.9155, 'grad_norm': 2.8516080379486084, 'learning_rate': 4.232470141869952e-06, 'epoch': 0.71} [Rank 0] Trainer log: {'loss': 0.6791, 'grad_norm': 2.2753679752349854, 'learning_rate': 4.226900774663903e-06}[Rank 3] Trainer log: {'loss': 0.6791, 'grad_norm': 2.2753679752349854, 'learning_rate': 4.226900774663903e-06}[Rank 2] Trainer log: {'loss': 0.6791, 'grad_norm': 2.2753679752349854, 'learning_rate': 4.226900774663903e-06} [Rank 1] Trainer log: {'loss': 0.6791, 'grad_norm': 2.2753679752349854, 'learning_rate': 4.226900774663903e-06} {'loss': 0.6791, 'grad_norm': 2.2753679752349854, 'learning_rate': 4.226900774663903e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9834, 'grad_norm': 4.53400993347168, 'learning_rate': 4.22133409201298e-06}[Rank 0] Trainer log: {'loss': 0.9834, 'grad_norm': 4.53400993347168, 'learning_rate': 4.22133409201298e-06}[Rank 3] Trainer log: {'loss': 0.9834, 'grad_norm': 4.53400993347168, 'learning_rate': 4.22133409201298e-06} [Rank 1] Trainer log: {'loss': 0.9834, 'grad_norm': 4.53400993347168, 'learning_rate': 4.22133409201298e-06} {'loss': 0.9834, 'grad_norm': 4.53400993347168, 'learning_rate': 4.22133409201298e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.6915, 'grad_norm': 11.448282241821289, 'learning_rate': 4.215770096505745e-06}[Rank 2] Trainer log: {'loss': 0.6915, 'grad_norm': 11.448282241821289, 'learning_rate': 4.215770096505745e-06}[Rank 0] Trainer log: {'loss': 0.6915, 'grad_norm': 11.448282241821289, 'learning_rate': 4.215770096505745e-06} [Rank 1] Trainer log: {'loss': 0.6915, 'grad_norm': 11.448282241821289, 'learning_rate': 4.215770096505745e-06} {'loss': 0.6915, 'grad_norm': 11.448282241821289, 'learning_rate': 4.215770096505745e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7634, 'grad_norm': 4.264327526092529, 'learning_rate': 4.210208790729523e-06}[Rank 0] Trainer log: {'loss': 0.7634, 'grad_norm': 4.264327526092529, 'learning_rate': 4.210208790729523e-06} [Rank 3] Trainer log: {'loss': 0.7634, 'grad_norm': 4.264327526092529, 'learning_rate': 4.210208790729523e-06} [Rank 1] Trainer log: {'loss': 0.7634, 'grad_norm': 4.264327526092529, 'learning_rate': 4.210208790729523e-06} {'loss': 0.7634, 'grad_norm': 4.264327526092529, 'learning_rate': 4.210208790729523e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.5659, 'grad_norm': 8.55467414855957, 'learning_rate': 4.204650177270386e-06} [Rank 3] Trainer log: {'loss': 0.5659, 'grad_norm': 8.55467414855957, 'learning_rate': 4.204650177270386e-06}[Rank 0] Trainer log: {'loss': 0.5659, 'grad_norm': 8.55467414855957, 'learning_rate': 4.204650177270386e-06} [Rank 1] Trainer log: {'loss': 0.5659, 'grad_norm': 8.55467414855957, 'learning_rate': 4.204650177270386e-06} {'loss': 0.5659, 'grad_norm': 8.55467414855957, 'learning_rate': 4.204650177270386e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7743, 'grad_norm': 5.354011058807373, 'learning_rate': 4.199094258713147e-06} [Rank 0] Trainer log: {'loss': 0.7743, 'grad_norm': 5.354011058807373, 'learning_rate': 4.199094258713147e-06}[Rank 1] Trainer log: {'loss': 0.7743, 'grad_norm': 5.354011058807373, 'learning_rate': 4.199094258713147e-06} [Rank 3] Trainer log: {'loss': 0.7743, 'grad_norm': 5.354011058807373, 'learning_rate': 4.199094258713147e-06} {'loss': 0.7743, 'grad_norm': 5.354011058807373, 'learning_rate': 4.199094258713147e-06, 'epoch': 0.71} [Rank 0] Trainer log: {'loss': 0.7208, 'grad_norm': 4.43241548538208, 'learning_rate': 4.193541037641367e-06}[Rank 2] Trainer log: {'loss': 0.7208, 'grad_norm': 4.43241548538208, 'learning_rate': 4.193541037641367e-06}[Rank 3] Trainer log: {'loss': 0.7208, 'grad_norm': 4.43241548538208, 'learning_rate': 4.193541037641367e-06} [Rank 1] Trainer log: {'loss': 0.7208, 'grad_norm': 4.43241548538208, 'learning_rate': 4.193541037641367e-06} {'loss': 0.7208, 'grad_norm': 4.43241548538208, 'learning_rate': 4.193541037641367e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7176, 'grad_norm': 3.427935838699341, 'learning_rate': 4.187990516637361e-06} [Rank 3] Trainer log: {'loss': 0.7176, 'grad_norm': 3.427935838699341, 'learning_rate': 4.187990516637361e-06} [Rank 1] Trainer log: {'loss': 0.7176, 'grad_norm': 3.427935838699341, 'learning_rate': 4.187990516637361e-06}[Rank 0] Trainer log: {'loss': 0.7176, 'grad_norm': 3.427935838699341, 'learning_rate': 4.187990516637361e-06} {'loss': 0.7176, 'grad_norm': 3.427935838699341, 'learning_rate': 4.187990516637361e-06, 'epoch': 0.71} [Rank 1] Trainer log: {'loss': 0.5275, 'grad_norm': 3.3254170417785645, 'learning_rate': 4.182442698282181e-06}[Rank 0] Trainer log: {'loss': 0.5275, 'grad_norm': 3.3254170417785645, 'learning_rate': 4.182442698282181e-06}[Rank 3] Trainer log: {'loss': 0.5275, 'grad_norm': 3.3254170417785645, 'learning_rate': 4.182442698282181e-06} {'loss': 0.5275, 'grad_norm': 3.3254170417785645, 'learning_rate': 4.182442698282181e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.5275, 'grad_norm': 3.3254170417785645, 'learning_rate': 4.182442698282181e-06} [Rank 3] Trainer log: {'loss': 0.8493, 'grad_norm': 5.9348673820495605, 'learning_rate': 4.176897585155619e-06}[Rank 2] Trainer log: {'loss': 0.8493, 'grad_norm': 5.9348673820495605, 'learning_rate': 4.176897585155619e-06}[Rank 1] Trainer log: {'loss': 0.8493, 'grad_norm': 5.9348673820495605, 'learning_rate': 4.176897585155619e-06} [Rank 0] Trainer log: {'loss': 0.8493, 'grad_norm': 5.9348673820495605, 'learning_rate': 4.176897585155619e-06} {'loss': 0.8493, 'grad_norm': 5.9348673820495605, 'learning_rate': 4.176897585155619e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.8662, 'grad_norm': 6.233503341674805, 'learning_rate': 4.171355179836222e-06}[Rank 3] Trainer log: {'loss': 0.8662, 'grad_norm': 6.233503341674805, 'learning_rate': 4.171355179836222e-06} [Rank 0] Trainer log: {'loss': 0.8662, 'grad_norm': 6.233503341674805, 'learning_rate': 4.171355179836222e-06}[Rank 1] Trainer log: {'loss': 0.8662, 'grad_norm': 6.233503341674805, 'learning_rate': 4.171355179836222e-06} {'loss': 0.8662, 'grad_norm': 6.233503341674805, 'learning_rate': 4.171355179836222e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.8647, 'grad_norm': 5.933842658996582, 'learning_rate': 4.165815484901262e-06}[Rank 3] Trainer log: {'loss': 0.8647, 'grad_norm': 5.933842658996582, 'learning_rate': 4.165815484901262e-06}[Rank 0] Trainer log: {'loss': 0.8647, 'grad_norm': 5.933842658996582, 'learning_rate': 4.165815484901262e-06} [Rank 1] Trainer log: {'loss': 0.8647, 'grad_norm': 5.933842658996582, 'learning_rate': 4.165815484901262e-06} {'loss': 0.8647, 'grad_norm': 5.933842658996582, 'learning_rate': 4.165815484901262e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.4969, 'grad_norm': 1.9792065620422363, 'learning_rate': 4.160278502926763e-06}[Rank 1] Trainer log: {'loss': 0.4969, 'grad_norm': 1.9792065620422363, 'learning_rate': 4.160278502926763e-06} [Rank 3] Trainer log: {'loss': 0.4969, 'grad_norm': 1.9792065620422363, 'learning_rate': 4.160278502926763e-06}[Rank 0] Trainer log: {'loss': 0.4969, 'grad_norm': 1.9792065620422363, 'learning_rate': 4.160278502926763e-06} {'loss': 0.4969, 'grad_norm': 1.9792065620422363, 'learning_rate': 4.160278502926763e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7356, 'grad_norm': 3.876523494720459, 'learning_rate': 4.154744236487485e-06} [Rank 1] Trainer log: {'loss': 0.7356, 'grad_norm': 3.876523494720459, 'learning_rate': 4.154744236487485e-06}[Rank 3] Trainer log: {'loss': 0.7356, 'grad_norm': 3.876523494720459, 'learning_rate': 4.154744236487485e-06} [Rank 0] Trainer log: {'loss': 0.7356, 'grad_norm': 3.876523494720459, 'learning_rate': 4.154744236487485e-06} {'loss': 0.7356, 'grad_norm': 3.876523494720459, 'learning_rate': 4.154744236487485e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7697, 'grad_norm': 4.575761795043945, 'learning_rate': 4.1492126881569225e-06}[Rank 3] Trainer log: {'loss': 0.7697, 'grad_norm': 4.575761795043945, 'learning_rate': 4.1492126881569225e-06} [Rank 1] Trainer log: {'loss': 0.7697, 'grad_norm': 4.575761795043945, 'learning_rate': 4.1492126881569225e-06} [Rank 0] Trainer log: {'loss': 0.7697, 'grad_norm': 4.575761795043945, 'learning_rate': 4.1492126881569225e-06} {'loss': 0.7697, 'grad_norm': 4.575761795043945, 'learning_rate': 4.1492126881569225e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.9979, 'grad_norm': 3.1232149600982666, 'learning_rate': 4.143683860507302e-06} [Rank 0] Trainer log: {'loss': 0.9979, 'grad_norm': 3.1232149600982666, 'learning_rate': 4.143683860507302e-06}[Rank 3] Trainer log: {'loss': 0.9979, 'grad_norm': 3.1232149600982666, 'learning_rate': 4.143683860507302e-06} [Rank 1] Trainer log: {'loss': 0.9979, 'grad_norm': 3.1232149600982666, 'learning_rate': 4.143683860507302e-06} {'loss': 0.9979, 'grad_norm': 3.1232149600982666, 'learning_rate': 4.143683860507302e-06, 'epoch': 0.71} [Rank 3] Trainer log: {'loss': 0.9656, 'grad_norm': 4.299263954162598, 'learning_rate': 4.138157756109595e-06} [Rank 2] Trainer log: {'loss': 0.9656, 'grad_norm': 4.299263954162598, 'learning_rate': 4.138157756109595e-06} [Rank 1] Trainer log: {'loss': 0.9656, 'grad_norm': 4.299263954162598, 'learning_rate': 4.138157756109595e-06} [Rank 0] Trainer log: {'loss': 0.9656, 'grad_norm': 4.299263954162598, 'learning_rate': 4.138157756109595e-06} {'loss': 0.9656, 'grad_norm': 4.299263954162598, 'learning_rate': 4.138157756109595e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.8899, 'grad_norm': 4.615522861480713, 'learning_rate': 4.1326343775335e-06}[Rank 3] Trainer log: {'loss': 0.8899, 'grad_norm': 4.615522861480713, 'learning_rate': 4.1326343775335e-06} [Rank 0] Trainer log: {'loss': 0.8899, 'grad_norm': 4.615522861480713, 'learning_rate': 4.1326343775335e-06} [Rank 1] Trainer log: {'loss': 0.8899, 'grad_norm': 4.615522861480713, 'learning_rate': 4.1326343775335e-06} {'loss': 0.8899, 'grad_norm': 4.615522861480713, 'learning_rate': 4.1326343775335e-06, 'epoch': 0.71} [Rank 0] Trainer log: {'loss': 0.9763, 'grad_norm': 6.0408830642700195, 'learning_rate': 4.127113727347444e-06}[Rank 2] Trainer log: {'loss': 0.9763, 'grad_norm': 6.0408830642700195, 'learning_rate': 4.127113727347444e-06}[Rank 3] Trainer log: {'loss': 0.9763, 'grad_norm': 6.0408830642700195, 'learning_rate': 4.127113727347444e-06} [Rank 1] Trainer log: {'loss': 0.9763, 'grad_norm': 6.0408830642700195, 'learning_rate': 4.127113727347444e-06} {'loss': 0.9763, 'grad_norm': 6.0408830642700195, 'learning_rate': 4.127113727347444e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.7057, 'grad_norm': 8.267142295837402, 'learning_rate': 4.121595808118598e-06} [Rank 3] Trainer log: {'loss': 0.7057, 'grad_norm': 8.267142295837402, 'learning_rate': 4.121595808118598e-06}[Rank 0] Trainer log: {'loss': 0.7057, 'grad_norm': 8.267142295837402, 'learning_rate': 4.121595808118598e-06} [Rank 1] Trainer log: {'loss': 0.7057, 'grad_norm': 8.267142295837402, 'learning_rate': 4.121595808118598e-06} {'loss': 0.7057, 'grad_norm': 8.267142295837402, 'learning_rate': 4.121595808118598e-06, 'epoch': 0.71} [Rank 2] Trainer log: {'loss': 0.623, 'grad_norm': 11.004233360290527, 'learning_rate': 4.11608062241285e-06} [Rank 1] Trainer log: {'loss': 0.623, 'grad_norm': 11.004233360290527, 'learning_rate': 4.11608062241285e-06} [Rank 0] Trainer log: {'loss': 0.623, 'grad_norm': 11.004233360290527, 'learning_rate': 4.11608062241285e-06}[Rank 3] Trainer log: {'loss': 0.623, 'grad_norm': 11.004233360290527, 'learning_rate': 4.11608062241285e-06} {'loss': 0.623, 'grad_norm': 11.004233360290527, 'learning_rate': 4.11608062241285e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.5381, 'grad_norm': 2.573301076889038, 'learning_rate': 4.110568172794825e-06}[Rank 3] Trainer log: {'loss': 0.5381, 'grad_norm': 2.573301076889038, 'learning_rate': 4.110568172794825e-06} [Rank 2] Trainer log: {'loss': 0.5381, 'grad_norm': 2.573301076889038, 'learning_rate': 4.110568172794825e-06} [Rank 0] Trainer log: {'loss': 0.5381, 'grad_norm': 2.573301076889038, 'learning_rate': 4.110568172794825e-06} {'loss': 0.5381, 'grad_norm': 2.573301076889038, 'learning_rate': 4.110568172794825e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7906, 'grad_norm': 6.01561975479126, 'learning_rate': 4.1050584618278755e-06}[Rank 3] Trainer log: {'loss': 0.7906, 'grad_norm': 6.01561975479126, 'learning_rate': 4.1050584618278755e-06}[Rank 1] Trainer log: {'loss': 0.7906, 'grad_norm': 6.01561975479126, 'learning_rate': 4.1050584618278755e-06} [Rank 0] Trainer log: {'loss': 0.7906, 'grad_norm': 6.01561975479126, 'learning_rate': 4.1050584618278755e-06} {'loss': 0.7906, 'grad_norm': 6.01561975479126, 'learning_rate': 4.1050584618278755e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 1.0441, 'grad_norm': 2.123535394668579, 'learning_rate': 4.099551492074077e-06}[Rank 0] Trainer log: {'loss': 1.0441, 'grad_norm': 2.123535394668579, 'learning_rate': 4.099551492074077e-06} [Rank 2] Trainer log: {'loss': 1.0441, 'grad_norm': 2.123535394668579, 'learning_rate': 4.099551492074077e-06} [Rank 1] Trainer log: {'loss': 1.0441, 'grad_norm': 2.123535394668579, 'learning_rate': 4.099551492074077e-06} {'loss': 1.0441, 'grad_norm': 2.123535394668579, 'learning_rate': 4.099551492074077e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.7576, 'grad_norm': 11.332219123840332, 'learning_rate': 4.094047266094225e-06}[Rank 2] Trainer log: {'loss': 0.7576, 'grad_norm': 11.332219123840332, 'learning_rate': 4.094047266094225e-06}[Rank 3] Trainer log: {'loss': 0.7576, 'grad_norm': 11.332219123840332, 'learning_rate': 4.094047266094225e-06} [Rank 1] Trainer log: {'loss': 0.7576, 'grad_norm': 11.332219123840332, 'learning_rate': 4.094047266094225e-06} {'loss': 0.7576, 'grad_norm': 11.332219123840332, 'learning_rate': 4.094047266094225e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.9132, 'grad_norm': 4.64849328994751, 'learning_rate': 4.088545786447855e-06}[Rank 3] Trainer log: {'loss': 0.9132, 'grad_norm': 4.64849328994751, 'learning_rate': 4.088545786447855e-06}[Rank 2] Trainer log: {'loss': 0.9132, 'grad_norm': 4.64849328994751, 'learning_rate': 4.088545786447855e-06} [Rank 1] Trainer log: {'loss': 0.9132, 'grad_norm': 4.64849328994751, 'learning_rate': 4.088545786447855e-06} {'loss': 0.9132, 'grad_norm': 4.64849328994751, 'learning_rate': 4.088545786447855e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.73, 'grad_norm': 5.09391975402832, 'learning_rate': 4.0830470556932124e-06}[Rank 1] Trainer log: {'loss': 0.73, 'grad_norm': 5.09391975402832, 'learning_rate': 4.0830470556932124e-06}[Rank 2] Trainer log: {'loss': 0.73, 'grad_norm': 5.09391975402832, 'learning_rate': 4.0830470556932124e-06} [Rank 3] Trainer log: {'loss': 0.73, 'grad_norm': 5.09391975402832, 'learning_rate': 4.0830470556932124e-06} {'loss': 0.73, 'grad_norm': 5.09391975402832, 'learning_rate': 4.0830470556932124e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7362, 'grad_norm': 8.501914024353027, 'learning_rate': 4.077551076387267e-06}[Rank 3] Trainer log: {'loss': 0.7362, 'grad_norm': 8.501914024353027, 'learning_rate': 4.077551076387267e-06} [Rank 1] Trainer log: {'loss': 0.7362, 'grad_norm': 8.501914024353027, 'learning_rate': 4.077551076387267e-06} [Rank 0] Trainer log: {'loss': 0.7362, 'grad_norm': 8.501914024353027, 'learning_rate': 4.077551076387267e-06} {'loss': 0.7362, 'grad_norm': 8.501914024353027, 'learning_rate': 4.077551076387267e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.6186, 'grad_norm': 4.556635856628418, 'learning_rate': 4.072057851085707e-06}[Rank 2] Trainer log: {'loss': 0.6186, 'grad_norm': 4.556635856628418, 'learning_rate': 4.072057851085707e-06} [Rank 0] Trainer log: {'loss': 0.6186, 'grad_norm': 4.556635856628418, 'learning_rate': 4.072057851085707e-06} [Rank 1] Trainer log: {'loss': 0.6186, 'grad_norm': 4.556635856628418, 'learning_rate': 4.072057851085707e-06} {'loss': 0.6186, 'grad_norm': 4.556635856628418, 'learning_rate': 4.072057851085707e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.8979, 'grad_norm': 8.677742004394531, 'learning_rate': 4.0665673823429465e-06}[Rank 3] Trainer log: {'loss': 0.8979, 'grad_norm': 8.677742004394531, 'learning_rate': 4.0665673823429465e-06} [Rank 0] Trainer log: {'loss': 0.8979, 'grad_norm': 8.677742004394531, 'learning_rate': 4.0665673823429465e-06}[Rank 1] Trainer log: {'loss': 0.8979, 'grad_norm': 8.677742004394531, 'learning_rate': 4.0665673823429465e-06} {'loss': 0.8979, 'grad_norm': 8.677742004394531, 'learning_rate': 4.0665673823429465e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7518, 'grad_norm': 4.541819095611572, 'learning_rate': 4.0610796727121175e-06}[Rank 1] Trainer log: {'loss': 0.7518, 'grad_norm': 4.541819095611572, 'learning_rate': 4.0610796727121175e-06} [Rank 0] Trainer log: {'loss': 0.7518, 'grad_norm': 4.541819095611572, 'learning_rate': 4.0610796727121175e-06} [Rank 3] Trainer log: {'loss': 0.7518, 'grad_norm': 4.541819095611572, 'learning_rate': 4.0610796727121175e-06} {'loss': 0.7518, 'grad_norm': 4.541819095611572, 'learning_rate': 4.0610796727121175e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.8967, 'grad_norm': 4.127402305603027, 'learning_rate': 4.055594724745059e-06}[Rank 3] Trainer log: {'loss': 0.8967, 'grad_norm': 4.127402305603027, 'learning_rate': 4.055594724745059e-06}[Rank 2] Trainer log: {'loss': 0.8967, 'grad_norm': 4.127402305603027, 'learning_rate': 4.055594724745059e-06} [Rank 1] Trainer log: {'loss': 0.8967, 'grad_norm': 4.127402305603027, 'learning_rate': 4.055594724745059e-06} {'loss': 0.8967, 'grad_norm': 4.127402305603027, 'learning_rate': 4.055594724745059e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9002, 'grad_norm': 5.696064472198486, 'learning_rate': 4.05011254099234e-06}[Rank 3] Trainer log: {'loss': 0.9002, 'grad_norm': 5.696064472198486, 'learning_rate': 4.05011254099234e-06} [Rank 1] Trainer log: {'loss': 0.9002, 'grad_norm': 5.696064472198486, 'learning_rate': 4.05011254099234e-06} [Rank 0] Trainer log: {'loss': 0.9002, 'grad_norm': 5.696064472198486, 'learning_rate': 4.05011254099234e-06} {'loss': 0.9002, 'grad_norm': 5.696064472198486, 'learning_rate': 4.05011254099234e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.8441, 'grad_norm': 4.223276138305664, 'learning_rate': 4.044633124003234e-06}[Rank 0] Trainer log: {'loss': 0.8441, 'grad_norm': 4.223276138305664, 'learning_rate': 4.044633124003234e-06} [Rank 3] Trainer log: {'loss': 0.8441, 'grad_norm': 4.223276138305664, 'learning_rate': 4.044633124003234e-06}[Rank 1] Trainer log: {'loss': 0.8441, 'grad_norm': 4.223276138305664, 'learning_rate': 4.044633124003234e-06} {'loss': 0.8441, 'grad_norm': 4.223276138305664, 'learning_rate': 4.044633124003234e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 1.0908, 'grad_norm': 2.3108315467834473, 'learning_rate': 4.039156476325724e-06}[Rank 0] Trainer log: {'loss': 1.0908, 'grad_norm': 2.3108315467834473, 'learning_rate': 4.039156476325724e-06}[Rank 3] Trainer log: {'loss': 1.0908, 'grad_norm': 2.3108315467834473, 'learning_rate': 4.039156476325724e-06} [Rank 1] Trainer log: {'loss': 1.0908, 'grad_norm': 2.3108315467834473, 'learning_rate': 4.039156476325724e-06} {'loss': 1.0908, 'grad_norm': 2.3108315467834473, 'learning_rate': 4.039156476325724e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.5624, 'grad_norm': 17.971092224121094, 'learning_rate': 4.033682600506522e-06}[Rank 1] Trainer log: {'loss': 0.5624, 'grad_norm': 17.971092224121094, 'learning_rate': 4.033682600506522e-06}[Rank 0] Trainer log: {'loss': 0.5624, 'grad_norm': 17.971092224121094, 'learning_rate': 4.033682600506522e-06} [Rank 3] Trainer log: {'loss': 0.5624, 'grad_norm': 17.971092224121094, 'learning_rate': 4.033682600506522e-06} {'loss': 0.5624, 'grad_norm': 17.971092224121094, 'learning_rate': 4.033682600506522e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.8169, 'grad_norm': 3.0681138038635254, 'learning_rate': 4.028211499091036e-06}[Rank 3] Trainer log: {'loss': 0.8169, 'grad_norm': 3.0681138038635254, 'learning_rate': 4.028211499091036e-06}[Rank 2] Trainer log: {'loss': 0.8169, 'grad_norm': 3.0681138038635254, 'learning_rate': 4.028211499091036e-06} [Rank 1] Trainer log: {'loss': 0.8169, 'grad_norm': 3.0681138038635254, 'learning_rate': 4.028211499091036e-06} {'loss': 0.8169, 'grad_norm': 3.0681138038635254, 'learning_rate': 4.028211499091036e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.9723, 'grad_norm': 2.137728691101074, 'learning_rate': 4.0227431746233855e-06} [Rank 1] Trainer log: {'loss': 0.9723, 'grad_norm': 2.137728691101074, 'learning_rate': 4.0227431746233855e-06}[Rank 0] Trainer log: {'loss': 0.9723, 'grad_norm': 2.137728691101074, 'learning_rate': 4.0227431746233855e-06} [Rank 2] Trainer log: {'loss': 0.9723, 'grad_norm': 2.137728691101074, 'learning_rate': 4.0227431746233855e-06} {'loss': 0.9723, 'grad_norm': 2.137728691101074, 'learning_rate': 4.0227431746233855e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9506, 'grad_norm': 4.653837203979492, 'learning_rate': 4.017277629646407e-06}[Rank 3] Trainer log: {'loss': 0.9506, 'grad_norm': 4.653837203979492, 'learning_rate': 4.017277629646407e-06} [Rank 1] Trainer log: {'loss': 0.9506, 'grad_norm': 4.653837203979492, 'learning_rate': 4.017277629646407e-06} [Rank 0] Trainer log: {'loss': 0.9506, 'grad_norm': 4.653837203979492, 'learning_rate': 4.017277629646407e-06} {'loss': 0.9506, 'grad_norm': 4.653837203979492, 'learning_rate': 4.017277629646407e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9366, 'grad_norm': 3.3448636531829834, 'learning_rate': 4.0118148667016395e-06}[Rank 0] Trainer log: {'loss': 0.9366, 'grad_norm': 3.3448636531829834, 'learning_rate': 4.0118148667016395e-06}[Rank 3] Trainer log: {'loss': 0.9366, 'grad_norm': 3.3448636531829834, 'learning_rate': 4.0118148667016395e-06} [Rank 1] Trainer log: {'loss': 0.9366, 'grad_norm': 3.3448636531829834, 'learning_rate': 4.0118148667016395e-06}{'loss': 0.9366, 'grad_norm': 3.3448636531829834, 'learning_rate': 4.0118148667016395e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.8985, 'grad_norm': 9.929661750793457, 'learning_rate': 4.006354888329325e-06}[Rank 3] Trainer log: {'loss': 0.8985, 'grad_norm': 9.929661750793457, 'learning_rate': 4.006354888329325e-06} [Rank 0] Trainer log: {'loss': 0.8985, 'grad_norm': 9.929661750793457, 'learning_rate': 4.006354888329325e-06} [Rank 1] Trainer log: {'loss': 0.8985, 'grad_norm': 9.929661750793457, 'learning_rate': 4.006354888329325e-06} {'loss': 0.8985, 'grad_norm': 9.929661750793457, 'learning_rate': 4.006354888329325e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.6792, 'grad_norm': 7.521327972412109, 'learning_rate': 4.000897697068418e-06}[Rank 3] Trainer log: {'loss': 0.6792, 'grad_norm': 7.521327972412109, 'learning_rate': 4.000897697068418e-06}[Rank 0] Trainer log: {'loss': 0.6792, 'grad_norm': 7.521327972412109, 'learning_rate': 4.000897697068418e-06} [Rank 1] Trainer log: {'loss': 0.6792, 'grad_norm': 7.521327972412109, 'learning_rate': 4.000897697068418e-06} {'loss': 0.6792, 'grad_norm': 7.521327972412109, 'learning_rate': 4.000897697068418e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.8196, 'grad_norm': 2.7598965167999268, 'learning_rate': 3.995443295456571e-06}[Rank 2] Trainer log: {'loss': 0.8196, 'grad_norm': 2.7598965167999268, 'learning_rate': 3.995443295456571e-06}[Rank 1] Trainer log: {'loss': 0.8196, 'grad_norm': 2.7598965167999268, 'learning_rate': 3.995443295456571e-06} [Rank 0] Trainer log: {'loss': 0.8196, 'grad_norm': 2.7598965167999268, 'learning_rate': 3.995443295456571e-06} {'loss': 0.8196, 'grad_norm': 2.7598965167999268, 'learning_rate': 3.995443295456571e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 1.0066, 'grad_norm': 7.4459614753723145, 'learning_rate': 3.989991686030139e-06}[Rank 2] Trainer log: {'loss': 1.0066, 'grad_norm': 7.4459614753723145, 'learning_rate': 3.989991686030139e-06} [Rank 1] Trainer log: {'loss': 1.0066, 'grad_norm': 7.4459614753723145, 'learning_rate': 3.989991686030139e-06} [Rank 0] Trainer log: {'loss': 1.0066, 'grad_norm': 7.4459614753723145, 'learning_rate': 3.989991686030139e-06} {'loss': 1.0066, 'grad_norm': 7.4459614753723145, 'learning_rate': 3.989991686030139e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9401, 'grad_norm': 3.7539548873901367, 'learning_rate': 3.984542871324185e-06}[Rank 0] Trainer log: {'loss': 0.9401, 'grad_norm': 3.7539548873901367, 'learning_rate': 3.984542871324185e-06} [Rank 3] Trainer log: {'loss': 0.9401, 'grad_norm': 3.7539548873901367, 'learning_rate': 3.984542871324185e-06} [Rank 1] Trainer log: {'loss': 0.9401, 'grad_norm': 3.7539548873901367, 'learning_rate': 3.984542871324185e-06} {'loss': 0.9401, 'grad_norm': 3.7539548873901367, 'learning_rate': 3.984542871324185e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7317, 'grad_norm': 16.043766021728516, 'learning_rate': 3.979096853872466e-06} [Rank 0] Trainer log: {'loss': 0.7317, 'grad_norm': 16.043766021728516, 'learning_rate': 3.979096853872466e-06}[Rank 1] Trainer log: {'loss': 0.7317, 'grad_norm': 16.043766021728516, 'learning_rate': 3.979096853872466e-06}[Rank 3] Trainer log: {'loss': 0.7317, 'grad_norm': 16.043766021728516, 'learning_rate': 3.979096853872466e-06} {'loss': 0.7317, 'grad_norm': 16.043766021728516, 'learning_rate': 3.979096853872466e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7767, 'grad_norm': 13.078115463256836, 'learning_rate': 3.973653636207437e-06}[Rank 0] Trainer log: {'loss': 0.7767, 'grad_norm': 13.078115463256836, 'learning_rate': 3.973653636207437e-06}[Rank 3] Trainer log: {'loss': 0.7767, 'grad_norm': 13.078115463256836, 'learning_rate': 3.973653636207437e-06} [Rank 1] Trainer log: {'loss': 0.7767, 'grad_norm': 13.078115463256836, 'learning_rate': 3.973653636207437e-06} {'loss': 0.7767, 'grad_norm': 13.078115463256836, 'learning_rate': 3.973653636207437e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.6665, 'grad_norm': 9.550402641296387, 'learning_rate': 3.968213220860259e-06}[Rank 1] Trainer log: {'loss': 0.6665, 'grad_norm': 9.550402641296387, 'learning_rate': 3.968213220860259e-06}[Rank 2] Trainer log: {'loss': 0.6665, 'grad_norm': 9.550402641296387, 'learning_rate': 3.968213220860259e-06} [Rank 3] Trainer log: {'loss': 0.6665, 'grad_norm': 9.550402641296387, 'learning_rate': 3.968213220860259e-06} {'loss': 0.6665, 'grad_norm': 9.550402641296387, 'learning_rate': 3.968213220860259e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.6169, 'grad_norm': 10.094653129577637, 'learning_rate': 3.9627756103607876e-06} [Rank 2] Trainer log: {'loss': 0.6169, 'grad_norm': 10.094653129577637, 'learning_rate': 3.9627756103607876e-06} [Rank 0] Trainer log: {'loss': 0.6169, 'grad_norm': 10.094653129577637, 'learning_rate': 3.9627756103607876e-06}[Rank 1] Trainer log: {'loss': 0.6169, 'grad_norm': 10.094653129577637, 'learning_rate': 3.9627756103607876e-06} {'loss': 0.6169, 'grad_norm': 10.094653129577637, 'learning_rate': 3.9627756103607876e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.9603, 'grad_norm': 5.695105075836182, 'learning_rate': 3.9573408072375645e-06}[Rank 0] Trainer log: {'loss': 0.9603, 'grad_norm': 5.695105075836182, 'learning_rate': 3.9573408072375645e-06} [Rank 2] Trainer log: {'loss': 0.9603, 'grad_norm': 5.695105075836182, 'learning_rate': 3.9573408072375645e-06} [Rank 3] Trainer log: {'loss': 0.9603, 'grad_norm': 5.695105075836182, 'learning_rate': 3.9573408072375645e-06} {'loss': 0.9603, 'grad_norm': 5.695105075836182, 'learning_rate': 3.9573408072375645e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 1.0526, 'grad_norm': 3.175508499145508, 'learning_rate': 3.951908814017842e-06} [Rank 3] Trainer log: {'loss': 1.0526, 'grad_norm': 3.175508499145508, 'learning_rate': 3.951908814017842e-06} [Rank 0] Trainer log: {'loss': 1.0526, 'grad_norm': 3.175508499145508, 'learning_rate': 3.951908814017842e-06}[Rank 1] Trainer log: {'loss': 1.0526, 'grad_norm': 3.175508499145508, 'learning_rate': 3.951908814017842e-06} {'loss': 1.0526, 'grad_norm': 3.175508499145508, 'learning_rate': 3.951908814017842e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.7933, 'grad_norm': 4.312385559082031, 'learning_rate': 3.946479633227552e-06}[Rank 2] Trainer log: {'loss': 0.7933, 'grad_norm': 4.312385559082031, 'learning_rate': 3.946479633227552e-06}[Rank 0] Trainer log: {'loss': 0.7933, 'grad_norm': 4.312385559082031, 'learning_rate': 3.946479633227552e-06} [Rank 1] Trainer log: {'loss': 0.7933, 'grad_norm': 4.312385559082031, 'learning_rate': 3.946479633227552e-06} {'loss': 0.7933, 'grad_norm': 4.312385559082031, 'learning_rate': 3.946479633227552e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.7392, 'grad_norm': 8.809444427490234, 'learning_rate': 3.941053267391322e-06} [Rank 3] Trainer log: {'loss': 0.7392, 'grad_norm': 8.809444427490234, 'learning_rate': 3.941053267391322e-06} [Rank 2] Trainer log: {'loss': 0.7392, 'grad_norm': 8.809444427490234, 'learning_rate': 3.941053267391322e-06} [Rank 0] Trainer log: {'loss': 0.7392, 'grad_norm': 8.809444427490234, 'learning_rate': 3.941053267391322e-06} {'loss': 0.7392, 'grad_norm': 8.809444427490234, 'learning_rate': 3.941053267391322e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.8069, 'grad_norm': 1.8654494285583496, 'learning_rate': 3.935629719032478e-06}[Rank 0] Trainer log: {'loss': 0.8069, 'grad_norm': 1.8654494285583496, 'learning_rate': 3.935629719032478e-06}[Rank 3] Trainer log: {'loss': 0.8069, 'grad_norm': 1.8654494285583496, 'learning_rate': 3.935629719032478e-06} [Rank 2] Trainer log: {'loss': 0.8069, 'grad_norm': 1.8654494285583496, 'learning_rate': 3.935629719032478e-06}{'loss': 0.8069, 'grad_norm': 1.8654494285583496, 'learning_rate': 3.935629719032478e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 0.8197, 'grad_norm': 8.762517929077148, 'learning_rate': 3.930208990673027e-06}[Rank 2] Trainer log: {'loss': 0.8197, 'grad_norm': 8.762517929077148, 'learning_rate': 3.930208990673027e-06}[Rank 3] Trainer log: {'loss': 0.8197, 'grad_norm': 8.762517929077148, 'learning_rate': 3.930208990673027e-06} [Rank 1] Trainer log: {'loss': 0.8197, 'grad_norm': 8.762517929077148, 'learning_rate': 3.930208990673027e-06} {'loss': 0.8197, 'grad_norm': 8.762517929077148, 'learning_rate': 3.930208990673027e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 1.0235, 'grad_norm': 5.785429954528809, 'learning_rate': 3.924791084833664e-06}[Rank 1] Trainer log: {'loss': 1.0235, 'grad_norm': 5.785429954528809, 'learning_rate': 3.924791084833664e-06} [Rank 3] Trainer log: {'loss': 1.0235, 'grad_norm': 5.785429954528809, 'learning_rate': 3.924791084833664e-06}[Rank 2] Trainer log: {'loss': 1.0235, 'grad_norm': 5.785429954528809, 'learning_rate': 3.924791084833664e-06} {'loss': 1.0235, 'grad_norm': 5.785429954528809, 'learning_rate': 3.924791084833664e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 1.0307, 'grad_norm': 4.218466281890869, 'learning_rate': 3.9193760040337796e-06}[Rank 3] Trainer log: {'loss': 1.0307, 'grad_norm': 4.218466281890869, 'learning_rate': 3.9193760040337796e-06}[Rank 2] Trainer log: {'loss': 1.0307, 'grad_norm': 4.218466281890869, 'learning_rate': 3.9193760040337796e-06} [Rank 0] Trainer log: {'loss': 1.0307, 'grad_norm': 4.218466281890869, 'learning_rate': 3.9193760040337796e-06} {'loss': 1.0307, 'grad_norm': 4.218466281890869, 'learning_rate': 3.9193760040337796e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.8346, 'grad_norm': 19.354171752929688, 'learning_rate': 3.913963750791448e-06} [Rank 1] Trainer log: {'loss': 0.8346, 'grad_norm': 19.354171752929688, 'learning_rate': 3.913963750791448e-06} [Rank 0] Trainer log: {'loss': 0.8346, 'grad_norm': 19.354171752929688, 'learning_rate': 3.913963750791448e-06}[Rank 3] Trainer log: {'loss': 0.8346, 'grad_norm': 19.354171752929688, 'learning_rate': 3.913963750791448e-06} {'loss': 0.8346, 'grad_norm': 19.354171752929688, 'learning_rate': 3.913963750791448e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.9261, 'grad_norm': 6.068065643310547, 'learning_rate': 3.908554327623425e-06} [Rank 1] Trainer log: {'loss': 0.9261, 'grad_norm': 6.068065643310547, 'learning_rate': 3.908554327623425e-06} [Rank 0] Trainer log: {'loss': 0.9261, 'grad_norm': 6.068065643310547, 'learning_rate': 3.908554327623425e-06}[Rank 2] Trainer log: {'loss': 0.9261, 'grad_norm': 6.068065643310547, 'learning_rate': 3.908554327623425e-06} {'loss': 0.9261, 'grad_norm': 6.068065643310547, 'learning_rate': 3.908554327623425e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.9147, 'grad_norm': 6.813892364501953, 'learning_rate': 3.90314773704515e-06}[Rank 2] Trainer log: {'loss': 0.9147, 'grad_norm': 6.813892364501953, 'learning_rate': 3.90314773704515e-06}[Rank 3] Trainer log: {'loss': 0.9147, 'grad_norm': 6.813892364501953, 'learning_rate': 3.90314773704515e-06} [Rank 0] Trainer log: {'loss': 0.9147, 'grad_norm': 6.813892364501953, 'learning_rate': 3.90314773704515e-06} {'loss': 0.9147, 'grad_norm': 6.813892364501953, 'learning_rate': 3.90314773704515e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9744, 'grad_norm': 3.752495765686035, 'learning_rate': 3.897743981570752e-06}[Rank 1] Trainer log: {'loss': 0.9744, 'grad_norm': 3.752495765686035, 'learning_rate': 3.897743981570752e-06} [Rank 3] Trainer log: {'loss': 0.9744, 'grad_norm': 3.752495765686035, 'learning_rate': 3.897743981570752e-06} [Rank 0] Trainer log: {'loss': 0.9744, 'grad_norm': 3.752495765686035, 'learning_rate': 3.897743981570752e-06} {'loss': 0.9744, 'grad_norm': 3.752495765686035, 'learning_rate': 3.897743981570752e-06, 'epoch': 0.72} [Rank 1] Trainer log: {'loss': 0.7119, 'grad_norm': 10.177064895629883, 'learning_rate': 3.892343063713034e-06}[Rank 3] Trainer log: {'loss': 0.7119, 'grad_norm': 10.177064895629883, 'learning_rate': 3.892343063713034e-06}[Rank 2] Trainer log: {'loss': 0.7119, 'grad_norm': 10.177064895629883, 'learning_rate': 3.892343063713034e-06} [Rank 0] Trainer log: {'loss': 0.7119, 'grad_norm': 10.177064895629883, 'learning_rate': 3.892343063713034e-06} {'loss': 0.7119, 'grad_norm': 10.177064895629883, 'learning_rate': 3.892343063713034e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.8298, 'grad_norm': 2.8370087146759033, 'learning_rate': 3.886944985983481e-06}[Rank 2] Trainer log: {'loss': 0.8298, 'grad_norm': 2.8370087146759033, 'learning_rate': 3.886944985983481e-06} [Rank 0] Trainer log: {'loss': 0.8298, 'grad_norm': 2.8370087146759033, 'learning_rate': 3.886944985983481e-06}[Rank 1] Trainer log: {'loss': 0.8298, 'grad_norm': 2.8370087146759033, 'learning_rate': 3.886944985983481e-06} {'loss': 0.8298, 'grad_norm': 2.8370087146759033, 'learning_rate': 3.886944985983481e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.9488, 'grad_norm': 3.627634048461914, 'learning_rate': 3.8815497508922606e-06}[Rank 0] Trainer log: {'loss': 0.9488, 'grad_norm': 3.627634048461914, 'learning_rate': 3.8815497508922606e-06}[Rank 3] Trainer log: {'loss': 0.9488, 'grad_norm': 3.627634048461914, 'learning_rate': 3.8815497508922606e-06} [Rank 1] Trainer log: {'loss': 0.9488, 'grad_norm': 3.627634048461914, 'learning_rate': 3.8815497508922606e-06} {'loss': 0.9488, 'grad_norm': 3.627634048461914, 'learning_rate': 3.8815497508922606e-06, 'epoch': 0.72} [Rank 3] Trainer log: {'loss': 0.9546, 'grad_norm': 7.9517903327941895, 'learning_rate': 3.876157360948222e-06} [Rank 0] Trainer log: {'loss': 0.9546, 'grad_norm': 7.9517903327941895, 'learning_rate': 3.876157360948222e-06}[Rank 2] Trainer log: {'loss': 0.9546, 'grad_norm': 7.9517903327941895, 'learning_rate': 3.876157360948222e-06} [Rank 1] Trainer log: {'loss': 0.9546, 'grad_norm': 7.9517903327941895, 'learning_rate': 3.876157360948222e-06} {'loss': 0.9546, 'grad_norm': 7.9517903327941895, 'learning_rate': 3.876157360948222e-06, 'epoch': 0.72} [Rank 0] Trainer log: {'loss': 1.0796, 'grad_norm': 2.348418951034546, 'learning_rate': 3.870767818658878e-06}[Rank 3] Trainer log: {'loss': 1.0796, 'grad_norm': 2.348418951034546, 'learning_rate': 3.870767818658878e-06}[Rank 2] Trainer log: {'loss': 1.0796, 'grad_norm': 2.348418951034546, 'learning_rate': 3.870767818658878e-06} [Rank 1] Trainer log: {'loss': 1.0796, 'grad_norm': 2.348418951034546, 'learning_rate': 3.870767818658878e-06} {'loss': 1.0796, 'grad_norm': 2.348418951034546, 'learning_rate': 3.870767818658878e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.7913, 'grad_norm': 9.005146026611328, 'learning_rate': 3.865381126530433e-06} [Rank 0] Trainer log: {'loss': 0.7913, 'grad_norm': 9.005146026611328, 'learning_rate': 3.865381126530433e-06}[Rank 1] Trainer log: {'loss': 0.7913, 'grad_norm': 9.005146026611328, 'learning_rate': 3.865381126530433e-06} [Rank 3] Trainer log: {'loss': 0.7913, 'grad_norm': 9.005146026611328, 'learning_rate': 3.865381126530433e-06} {'loss': 0.7913, 'grad_norm': 9.005146026611328, 'learning_rate': 3.865381126530433e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 0.6376, 'grad_norm': 24.443086624145508, 'learning_rate': 3.859997287067756e-06} [Rank 0] Trainer log: {'loss': 0.6376, 'grad_norm': 24.443086624145508, 'learning_rate': 3.859997287067756e-06}[Rank 3] Trainer log: {'loss': 0.6376, 'grad_norm': 24.443086624145508, 'learning_rate': 3.859997287067756e-06} [Rank 1] Trainer log: {'loss': 0.6376, 'grad_norm': 24.443086624145508, 'learning_rate': 3.859997287067756e-06} {'loss': 0.6376, 'grad_norm': 24.443086624145508, 'learning_rate': 3.859997287067756e-06, 'epoch': 0.72} [Rank 2] Trainer log: {'loss': 1.018, 'grad_norm': 2.1181252002716064, 'learning_rate': 3.854616302774387e-06}[Rank 0] Trainer log: {'loss': 1.018, 'grad_norm': 2.1181252002716064, 'learning_rate': 3.854616302774387e-06}[Rank 3] Trainer log: {'loss': 1.018, 'grad_norm': 2.1181252002716064, 'learning_rate': 3.854616302774387e-06} [Rank 1] Trainer log: {'loss': 1.018, 'grad_norm': 2.1181252002716064, 'learning_rate': 3.854616302774387e-06} {'loss': 1.018, 'grad_norm': 2.1181252002716064, 'learning_rate': 3.854616302774387e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.9429, 'grad_norm': 1.9275622367858887, 'learning_rate': 3.84923817615255e-06} [Rank 3] Trainer log: {'loss': 0.9429, 'grad_norm': 1.9275622367858887, 'learning_rate': 3.84923817615255e-06}[Rank 0] Trainer log: {'loss': 0.9429, 'grad_norm': 1.9275622367858887, 'learning_rate': 3.84923817615255e-06} [Rank 1] Trainer log: {'loss': 0.9429, 'grad_norm': 1.9275622367858887, 'learning_rate': 3.84923817615255e-06} {'loss': 0.9429, 'grad_norm': 1.9275622367858887, 'learning_rate': 3.84923817615255e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.6806, 'grad_norm': 4.551875114440918, 'learning_rate': 3.843862909703132e-06} [Rank 1] Trainer log: {'loss': 0.6806, 'grad_norm': 4.551875114440918, 'learning_rate': 3.843862909703132e-06} [Rank 2] Trainer log: {'loss': 0.6806, 'grad_norm': 4.551875114440918, 'learning_rate': 3.843862909703132e-06} [Rank 0] Trainer log: {'loss': 0.6806, 'grad_norm': 4.551875114440918, 'learning_rate': 3.843862909703132e-06} {'loss': 0.6806, 'grad_norm': 4.551875114440918, 'learning_rate': 3.843862909703132e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8866, 'grad_norm': 3.0219669342041016, 'learning_rate': 3.838490505925686e-06}[Rank 0] Trainer log: {'loss': 0.8866, 'grad_norm': 3.0219669342041016, 'learning_rate': 3.838490505925686e-06}[Rank 1] Trainer log: {'loss': 0.8866, 'grad_norm': 3.0219669342041016, 'learning_rate': 3.838490505925686e-06} [Rank 3] Trainer log: {'loss': 0.8866, 'grad_norm': 3.0219669342041016, 'learning_rate': 3.838490505925686e-06} {'loss': 0.8866, 'grad_norm': 3.0219669342041016, 'learning_rate': 3.838490505925686e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 1.0833, 'grad_norm': 3.1251156330108643, 'learning_rate': 3.833120967318445e-06}[Rank 2] Trainer log: {'loss': 1.0833, 'grad_norm': 3.1251156330108643, 'learning_rate': 3.833120967318445e-06} [Rank 1] Trainer log: {'loss': 1.0833, 'grad_norm': 3.1251156330108643, 'learning_rate': 3.833120967318445e-06} [Rank 3] Trainer log: {'loss': 1.0833, 'grad_norm': 3.1251156330108643, 'learning_rate': 3.833120967318445e-06} {'loss': 1.0833, 'grad_norm': 3.1251156330108643, 'learning_rate': 3.833120967318445e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.8725, 'grad_norm': 5.988575458526611, 'learning_rate': 3.827754296378305e-06}[Rank 0] Trainer log: {'loss': 0.8725, 'grad_norm': 5.988575458526611, 'learning_rate': 3.827754296378305e-06}[Rank 2] Trainer log: {'loss': 0.8725, 'grad_norm': 5.988575458526611, 'learning_rate': 3.827754296378305e-06} [Rank 1] Trainer log: {'loss': 0.8725, 'grad_norm': 5.988575458526611, 'learning_rate': 3.827754296378305e-06} {'loss': 0.8725, 'grad_norm': 5.988575458526611, 'learning_rate': 3.827754296378305e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.7946, 'grad_norm': 3.4341444969177246, 'learning_rate': 3.822390495600821e-06}[Rank 2] Trainer log: {'loss': 0.7946, 'grad_norm': 3.4341444969177246, 'learning_rate': 3.822390495600821e-06} [Rank 1] Trainer log: {'loss': 0.7946, 'grad_norm': 3.4341444969177246, 'learning_rate': 3.822390495600821e-06} [Rank 0] Trainer log: {'loss': 0.7946, 'grad_norm': 3.4341444969177246, 'learning_rate': 3.822390495600821e-06} {'loss': 0.7946, 'grad_norm': 3.4341444969177246, 'learning_rate': 3.822390495600821e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7586, 'grad_norm': 7.051501274108887, 'learning_rate': 3.817029567480228e-06} [Rank 3] Trainer log: {'loss': 0.7586, 'grad_norm': 7.051501274108887, 'learning_rate': 3.817029567480228e-06} [Rank 0] Trainer log: {'loss': 0.7586, 'grad_norm': 7.051501274108887, 'learning_rate': 3.817029567480228e-06}[Rank 1] Trainer log: {'loss': 0.7586, 'grad_norm': 7.051501274108887, 'learning_rate': 3.817029567480228e-06} {'loss': 0.7586, 'grad_norm': 7.051501274108887, 'learning_rate': 3.817029567480228e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.9355, 'grad_norm': 3.4659993648529053, 'learning_rate': 3.811671514509413e-06}[Rank 2] Trainer log: {'loss': 0.9355, 'grad_norm': 3.4659993648529053, 'learning_rate': 3.811671514509413e-06}[Rank 3] Trainer log: {'loss': 0.9355, 'grad_norm': 3.4659993648529053, 'learning_rate': 3.811671514509413e-06} [Rank 1] Trainer log: {'loss': 0.9355, 'grad_norm': 3.4659993648529053, 'learning_rate': 3.811671514509413e-06} {'loss': 0.9355, 'grad_norm': 3.4659993648529053, 'learning_rate': 3.811671514509413e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8815, 'grad_norm': 9.521023750305176, 'learning_rate': 3.8063163391799263e-06} [Rank 0] Trainer log: {'loss': 0.8815, 'grad_norm': 9.521023750305176, 'learning_rate': 3.8063163391799263e-06}[Rank 3] Trainer log: {'loss': 0.8815, 'grad_norm': 9.521023750305176, 'learning_rate': 3.8063163391799263e-06} [Rank 1] Trainer log: {'loss': 0.8815, 'grad_norm': 9.521023750305176, 'learning_rate': 3.8063163391799263e-06} {'loss': 0.8815, 'grad_norm': 9.521023750305176, 'learning_rate': 3.8063163391799263e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.6906, 'grad_norm': 8.612207412719727, 'learning_rate': 3.8009640439819905e-06}[Rank 1] Trainer log: {'loss': 0.6906, 'grad_norm': 8.612207412719727, 'learning_rate': 3.8009640439819905e-06}[Rank 2] Trainer log: {'loss': 0.6906, 'grad_norm': 8.612207412719727, 'learning_rate': 3.8009640439819905e-06} [Rank 0] Trainer log: {'loss': 0.6906, 'grad_norm': 8.612207412719727, 'learning_rate': 3.8009640439819905e-06} {'loss': 0.6906, 'grad_norm': 8.612207412719727, 'learning_rate': 3.8009640439819905e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.9848, 'grad_norm': 6.943793296813965, 'learning_rate': 3.7956146314044794e-06}[Rank 2] Trainer log: {'loss': 0.9848, 'grad_norm': 6.943793296813965, 'learning_rate': 3.7956146314044794e-06}[Rank 3] Trainer log: {'loss': 0.9848, 'grad_norm': 6.943793296813965, 'learning_rate': 3.7956146314044794e-06} [Rank 1] Trainer log: {'loss': 0.9848, 'grad_norm': 6.943793296813965, 'learning_rate': 3.7956146314044794e-06} {'loss': 0.9848, 'grad_norm': 6.943793296813965, 'learning_rate': 3.7956146314044794e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.9336, 'grad_norm': 2.9677488803863525, 'learning_rate': 3.7902681039349253e-06}[Rank 2] Trainer log: {'loss': 0.9336, 'grad_norm': 2.9677488803863525, 'learning_rate': 3.7902681039349253e-06}[Rank 3] Trainer log: {'loss': 0.9336, 'grad_norm': 2.9677488803863525, 'learning_rate': 3.7902681039349253e-06} [Rank 1] Trainer log: {'loss': 0.9336, 'grad_norm': 2.9677488803863525, 'learning_rate': 3.7902681039349253e-06} {'loss': 0.9336, 'grad_norm': 2.9677488803863525, 'learning_rate': 3.7902681039349253e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7133, 'grad_norm': 7.755802631378174, 'learning_rate': 3.7849244640595263e-06}[Rank 1] Trainer log: {'loss': 0.7133, 'grad_norm': 7.755802631378174, 'learning_rate': 3.7849244640595263e-06} [Rank 3] Trainer log: {'loss': 0.7133, 'grad_norm': 7.755802631378174, 'learning_rate': 3.7849244640595263e-06} [Rank 0] Trainer log: {'loss': 0.7133, 'grad_norm': 7.755802631378174, 'learning_rate': 3.7849244640595263e-06} {'loss': 0.7133, 'grad_norm': 7.755802631378174, 'learning_rate': 3.7849244640595263e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.8745, 'grad_norm': 4.057244300842285, 'learning_rate': 3.7795837142631373e-06}[Rank 2] Trainer log: {'loss': 0.8745, 'grad_norm': 4.057244300842285, 'learning_rate': 3.7795837142631373e-06} [Rank 0] Trainer log: {'loss': 0.8745, 'grad_norm': 4.057244300842285, 'learning_rate': 3.7795837142631373e-06} [Rank 1] Trainer log: {'loss': 0.8745, 'grad_norm': 4.057244300842285, 'learning_rate': 3.7795837142631373e-06} {'loss': 0.8745, 'grad_norm': 4.057244300842285, 'learning_rate': 3.7795837142631373e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.6664, 'grad_norm': 2.4173669815063477, 'learning_rate': 3.7742458570292637e-06} [Rank 0] Trainer log: {'loss': 0.6664, 'grad_norm': 2.4173669815063477, 'learning_rate': 3.7742458570292637e-06}[Rank 3] Trainer log: {'loss': 0.6664, 'grad_norm': 2.4173669815063477, 'learning_rate': 3.7742458570292637e-06} [Rank 1] Trainer log: {'loss': 0.6664, 'grad_norm': 2.4173669815063477, 'learning_rate': 3.7742458570292637e-06} {'loss': 0.6664, 'grad_norm': 2.4173669815063477, 'learning_rate': 3.7742458570292637e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.9614, 'grad_norm': 5.431349754333496, 'learning_rate': 3.768910894840063e-06}[Rank 2] Trainer log: {'loss': 0.9614, 'grad_norm': 5.431349754333496, 'learning_rate': 3.768910894840063e-06} [Rank 3] Trainer log: {'loss': 0.9614, 'grad_norm': 5.431349754333496, 'learning_rate': 3.768910894840063e-06}[Rank 1] Trainer log: {'loss': 0.9614, 'grad_norm': 5.431349754333496, 'learning_rate': 3.768910894840063e-06} {'loss': 0.9614, 'grad_norm': 5.431349754333496, 'learning_rate': 3.768910894840063e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8171, 'grad_norm': 3.959756374359131, 'learning_rate': 3.763578830176361e-06} [Rank 3] Trainer log: {'loss': 0.8171, 'grad_norm': 3.959756374359131, 'learning_rate': 3.763578830176361e-06} [Rank 0] Trainer log: {'loss': 0.8171, 'grad_norm': 3.959756374359131, 'learning_rate': 3.763578830176361e-06}[Rank 1] Trainer log: {'loss': 0.8171, 'grad_norm': 3.959756374359131, 'learning_rate': 3.763578830176361e-06} {'loss': 0.8171, 'grad_norm': 3.959756374359131, 'learning_rate': 3.763578830176361e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.7247, 'grad_norm': 3.985069751739502, 'learning_rate': 3.7582496655176213e-06} [Rank 2] Trainer log: {'loss': 0.7247, 'grad_norm': 3.985069751739502, 'learning_rate': 3.7582496655176213e-06} [Rank 1] Trainer log: {'loss': 0.7247, 'grad_norm': 3.985069751739502, 'learning_rate': 3.7582496655176213e-06}[Rank 0] Trainer log: {'loss': 0.7247, 'grad_norm': 3.985069751739502, 'learning_rate': 3.7582496655176213e-06} {'loss': 0.7247, 'grad_norm': 3.985069751739502, 'learning_rate': 3.7582496655176213e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8274, 'grad_norm': 4.614591598510742, 'learning_rate': 3.7529234033419616e-06}[Rank 0] Trainer log: {'loss': 0.8274, 'grad_norm': 4.614591598510742, 'learning_rate': 3.7529234033419616e-06}[Rank 3] Trainer log: {'loss': 0.8274, 'grad_norm': 4.614591598510742, 'learning_rate': 3.7529234033419616e-06} [Rank 1] Trainer log: {'loss': 0.8274, 'grad_norm': 4.614591598510742, 'learning_rate': 3.7529234033419616e-06} {'loss': 0.8274, 'grad_norm': 4.614591598510742, 'learning_rate': 3.7529234033419616e-06, 'epoch': 0.73} [Rank 1] Trainer log: {'loss': 0.8193, 'grad_norm': 8.5137939453125, 'learning_rate': 3.7476000461261608e-06}[Rank 3] Trainer log: {'loss': 0.8193, 'grad_norm': 8.5137939453125, 'learning_rate': 3.7476000461261608e-06}[Rank 2] Trainer log: {'loss': 0.8193, 'grad_norm': 8.5137939453125, 'learning_rate': 3.7476000461261608e-06} [Rank 0] Trainer log: {'loss': 0.8193, 'grad_norm': 8.5137939453125, 'learning_rate': 3.7476000461261608e-06} {'loss': 0.8193, 'grad_norm': 8.5137939453125, 'learning_rate': 3.7476000461261608e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 1.0588, 'grad_norm': 6.638711929321289, 'learning_rate': 3.7422795963456316e-06}[Rank 2] Trainer log: {'loss': 1.0588, 'grad_norm': 6.638711929321289, 'learning_rate': 3.7422795963456316e-06} [Rank 1] Trainer log: {'loss': 1.0588, 'grad_norm': 6.638711929321289, 'learning_rate': 3.7422795963456316e-06}[Rank 3] Trainer log: {'loss': 1.0588, 'grad_norm': 6.638711929321289, 'learning_rate': 3.7422795963456316e-06} {'loss': 1.0588, 'grad_norm': 6.638711929321289, 'learning_rate': 3.7422795963456316e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7546, 'grad_norm': 19.913745880126953, 'learning_rate': 3.7369620564744447e-06}[Rank 1] Trainer log: {'loss': 0.7546, 'grad_norm': 19.913745880126953, 'learning_rate': 3.7369620564744447e-06}[Rank 0] Trainer log: {'loss': 0.7546, 'grad_norm': 19.913745880126953, 'learning_rate': 3.7369620564744447e-06} [Rank 3] Trainer log: {'loss': 0.7546, 'grad_norm': 19.913745880126953, 'learning_rate': 3.7369620564744447e-06} {'loss': 0.7546, 'grad_norm': 19.913745880126953, 'learning_rate': 3.7369620564744447e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.5741, 'grad_norm': 8.762887954711914, 'learning_rate': 3.7316474289853188e-06} [Rank 0] Trainer log: {'loss': 0.5741, 'grad_norm': 8.762887954711914, 'learning_rate': 3.7316474289853188e-06}[Rank 3] Trainer log: {'loss': 0.5741, 'grad_norm': 8.762887954711914, 'learning_rate': 3.7316474289853188e-06} [Rank 1] Trainer log: {'loss': 0.5741, 'grad_norm': 8.762887954711914, 'learning_rate': 3.7316474289853188e-06} {'loss': 0.5741, 'grad_norm': 8.762887954711914, 'learning_rate': 3.7316474289853188e-06, 'epoch': 0.73} [Rank 1] Trainer log: {'loss': 0.8481, 'grad_norm': 8.476551055908203, 'learning_rate': 3.7263357163496118e-06} [Rank 2] Trainer log: {'loss': 0.8481, 'grad_norm': 8.476551055908203, 'learning_rate': 3.7263357163496118e-06} [Rank 0] Trainer log: {'loss': 0.8481, 'grad_norm': 8.476551055908203, 'learning_rate': 3.7263357163496118e-06}[Rank 3] Trainer log: {'loss': 0.8481, 'grad_norm': 8.476551055908203, 'learning_rate': 3.7263357163496118e-06} {'loss': 0.8481, 'grad_norm': 8.476551055908203, 'learning_rate': 3.7263357163496118e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7403, 'grad_norm': 2.6959939002990723, 'learning_rate': 3.721026921037325e-06} [Rank 0] Trainer log: {'loss': 0.7403, 'grad_norm': 2.6959939002990723, 'learning_rate': 3.721026921037325e-06}[Rank 3] Trainer log: {'loss': 0.7403, 'grad_norm': 2.6959939002990723, 'learning_rate': 3.721026921037325e-06} [Rank 1] Trainer log: {'loss': 0.7403, 'grad_norm': 2.6959939002990723, 'learning_rate': 3.721026921037325e-06} {'loss': 0.7403, 'grad_norm': 2.6959939002990723, 'learning_rate': 3.721026921037325e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.6388, 'grad_norm': 3.647007703781128, 'learning_rate': 3.7157210455171143e-06}[Rank 1] Trainer log: {'loss': 0.6388, 'grad_norm': 3.647007703781128, 'learning_rate': 3.7157210455171143e-06}[Rank 2] Trainer log: {'loss': 0.6388, 'grad_norm': 3.647007703781128, 'learning_rate': 3.7157210455171143e-06} [Rank 0] Trainer log: {'loss': 0.6388, 'grad_norm': 3.647007703781128, 'learning_rate': 3.7157210455171143e-06} {'loss': 0.6388, 'grad_norm': 3.647007703781128, 'learning_rate': 3.7157210455171143e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.9265, 'grad_norm': 3.0168631076812744, 'learning_rate': 3.7104180922562673e-06}[Rank 3] Trainer log: {'loss': 0.9265, 'grad_norm': 3.0168631076812744, 'learning_rate': 3.7104180922562673e-06}[Rank 0] Trainer log: {'loss': 0.9265, 'grad_norm': 3.0168631076812744, 'learning_rate': 3.7104180922562673e-06} [Rank 1] Trainer log: {'loss': 0.9265, 'grad_norm': 3.0168631076812744, 'learning_rate': 3.7104180922562673e-06} {'loss': 0.9265, 'grad_norm': 3.0168631076812744, 'learning_rate': 3.7104180922562673e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.8338, 'grad_norm': 3.7211287021636963, 'learning_rate': 3.705118063720712e-06} [Rank 2] Trainer log: {'loss': 0.8338, 'grad_norm': 3.7211287021636963, 'learning_rate': 3.705118063720712e-06} [Rank 0] Trainer log: {'loss': 0.8338, 'grad_norm': 3.7211287021636963, 'learning_rate': 3.705118063720712e-06} [Rank 1] Trainer log: {'loss': 0.8338, 'grad_norm': 3.7211287021636963, 'learning_rate': 3.705118063720712e-06} {'loss': 0.8338, 'grad_norm': 3.7211287021636963, 'learning_rate': 3.705118063720712e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.6082, 'grad_norm': 11.747123718261719, 'learning_rate': 3.6998209623750282e-06}[Rank 3] Trainer log: {'loss': 0.6082, 'grad_norm': 11.747123718261719, 'learning_rate': 3.6998209623750282e-06}[Rank 2] Trainer log: {'loss': 0.6082, 'grad_norm': 11.747123718261719, 'learning_rate': 3.6998209623750282e-06} [Rank 1] Trainer log: {'loss': 0.6082, 'grad_norm': 11.747123718261719, 'learning_rate': 3.6998209623750282e-06} {'loss': 0.6082, 'grad_norm': 11.747123718261719, 'learning_rate': 3.6998209623750282e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.7257, 'grad_norm': 4.8666791915893555, 'learning_rate': 3.694526790682418e-06}[Rank 2] Trainer log: {'loss': 0.7257, 'grad_norm': 4.8666791915893555, 'learning_rate': 3.694526790682418e-06}[Rank 0] Trainer log: {'loss': 0.7257, 'grad_norm': 4.8666791915893555, 'learning_rate': 3.694526790682418e-06} [Rank 1] Trainer log: {'loss': 0.7257, 'grad_norm': 4.8666791915893555, 'learning_rate': 3.694526790682418e-06} {'loss': 0.7257, 'grad_norm': 4.8666791915893555, 'learning_rate': 3.694526790682418e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.7303, 'grad_norm': 8.852468490600586, 'learning_rate': 3.6892355511047353e-06}[Rank 2] Trainer log: {'loss': 0.7303, 'grad_norm': 8.852468490600586, 'learning_rate': 3.6892355511047353e-06}[Rank 0] Trainer log: {'loss': 0.7303, 'grad_norm': 8.852468490600586, 'learning_rate': 3.6892355511047353e-06} {'loss': 0.7303, 'grad_norm': 8.852468490600586, 'learning_rate': 3.6892355511047353e-06, 'epoch': 0.73} [Rank 1] Trainer log: {'loss': 0.7303, 'grad_norm': 8.852468490600586, 'learning_rate': 3.6892355511047353e-06} [Rank 0] Trainer log: {'loss': 0.8885, 'grad_norm': 6.204442501068115, 'learning_rate': 3.683947246102468e-06}[Rank 2] Trainer log: {'loss': 0.8885, 'grad_norm': 6.204442501068115, 'learning_rate': 3.683947246102468e-06} [Rank 3] Trainer log: {'loss': 0.8885, 'grad_norm': 6.204442501068115, 'learning_rate': 3.683947246102468e-06} [Rank 1] Trainer log: {'loss': 0.8885, 'grad_norm': 6.204442501068115, 'learning_rate': 3.683947246102468e-06} {'loss': 0.8885, 'grad_norm': 6.204442501068115, 'learning_rate': 3.683947246102468e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8415, 'grad_norm': 7.421847343444824, 'learning_rate': 3.678661878134733e-06}[Rank 1] Trainer log: {'loss': 0.8415, 'grad_norm': 7.421847343444824, 'learning_rate': 3.678661878134733e-06} [Rank 0] Trainer log: {'loss': 0.8415, 'grad_norm': 7.421847343444824, 'learning_rate': 3.678661878134733e-06} [Rank 3] Trainer log: {'loss': 0.8415, 'grad_norm': 7.421847343444824, 'learning_rate': 3.678661878134733e-06} {'loss': 0.8415, 'grad_norm': 7.421847343444824, 'learning_rate': 3.678661878134733e-06, 'epoch': 0.73} [Rank 1] Trainer log: {'loss': 0.9232, 'grad_norm': 8.416939735412598, 'learning_rate': 3.673379449659281e-06} [Rank 2] Trainer log: {'loss': 0.9232, 'grad_norm': 8.416939735412598, 'learning_rate': 3.673379449659281e-06} [Rank 0] Trainer log: {'loss': 0.9232, 'grad_norm': 8.416939735412598, 'learning_rate': 3.673379449659281e-06}[Rank 3] Trainer log: {'loss': 0.9232, 'grad_norm': 8.416939735412598, 'learning_rate': 3.673379449659281e-06} {'loss': 0.9232, 'grad_norm': 8.416939735412598, 'learning_rate': 3.673379449659281e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7093, 'grad_norm': 16.59530258178711, 'learning_rate': 3.6680999631325078e-06}[Rank 1] Trainer log: {'loss': 0.7093, 'grad_norm': 16.59530258178711, 'learning_rate': 3.6680999631325078e-06} [Rank 3] Trainer log: {'loss': 0.7093, 'grad_norm': 16.59530258178711, 'learning_rate': 3.6680999631325078e-06} [Rank 0] Trainer log: {'loss': 0.7093, 'grad_norm': 16.59530258178711, 'learning_rate': 3.6680999631325078e-06} {'loss': 0.7093, 'grad_norm': 16.59530258178711, 'learning_rate': 3.6680999631325078e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.9716, 'grad_norm': 6.134431838989258, 'learning_rate': 3.6628234210094292e-06} [Rank 0] Trainer log: {'loss': 0.9716, 'grad_norm': 6.134431838989258, 'learning_rate': 3.6628234210094292e-06}[Rank 3] Trainer log: {'loss': 0.9716, 'grad_norm': 6.134431838989258, 'learning_rate': 3.6628234210094292e-06} [Rank 1] Trainer log: {'loss': 0.9716, 'grad_norm': 6.134431838989258, 'learning_rate': 3.6628234210094292e-06} {'loss': 0.9716, 'grad_norm': 6.134431838989258, 'learning_rate': 3.6628234210094292e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.5728, 'grad_norm': 1.6645231246948242, 'learning_rate': 3.6575498257436915e-06} [Rank 3] Trainer log: {'loss': 0.5728, 'grad_norm': 1.6645231246948242, 'learning_rate': 3.6575498257436915e-06}[Rank 0] Trainer log: {'loss': 0.5728, 'grad_norm': 1.6645231246948242, 'learning_rate': 3.6575498257436915e-06} [Rank 1] Trainer log: {'loss': 0.5728, 'grad_norm': 1.6645231246948242, 'learning_rate': 3.6575498257436915e-06} {'loss': 0.5728, 'grad_norm': 1.6645231246948242, 'learning_rate': 3.6575498257436915e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 1.0256, 'grad_norm': 3.8568973541259766, 'learning_rate': 3.6522791797875832e-06} [Rank 0] Trainer log: {'loss': 1.0256, 'grad_norm': 3.8568973541259766, 'learning_rate': 3.6522791797875832e-06}[Rank 3] Trainer log: {'loss': 1.0256, 'grad_norm': 3.8568973541259766, 'learning_rate': 3.6522791797875832e-06} [Rank 1] Trainer log: {'loss': 1.0256, 'grad_norm': 3.8568973541259766, 'learning_rate': 3.6522791797875832e-06} {'loss': 1.0256, 'grad_norm': 3.8568973541259766, 'learning_rate': 3.6522791797875832e-06, 'epoch': 0.73} [Rank 0] Trainer log: {'loss': 0.7469, 'grad_norm': 6.0747880935668945, 'learning_rate': 3.6470114855920058e-06}[Rank 1] Trainer log: {'loss': 0.7469, 'grad_norm': 6.0747880935668945, 'learning_rate': 3.6470114855920058e-06} [Rank 2] Trainer log: {'loss': 0.7469, 'grad_norm': 6.0747880935668945, 'learning_rate': 3.6470114855920058e-06} [Rank 3] Trainer log: {'loss': 0.7469, 'grad_norm': 6.0747880935668945, 'learning_rate': 3.6470114855920058e-06} {'loss': 0.7469, 'grad_norm': 6.0747880935668945, 'learning_rate': 3.6470114855920058e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.8421, 'grad_norm': 10.826337814331055, 'learning_rate': 3.6417467456065013e-06}[Rank 1] Trainer log: {'loss': 0.8421, 'grad_norm': 10.826337814331055, 'learning_rate': 3.6417467456065013e-06}[Rank 2] Trainer log: {'loss': 0.8421, 'grad_norm': 10.826337814331055, 'learning_rate': 3.6417467456065013e-06} [Rank 0] Trainer log: {'loss': 0.8421, 'grad_norm': 10.826337814331055, 'learning_rate': 3.6417467456065013e-06} {'loss': 0.8421, 'grad_norm': 10.826337814331055, 'learning_rate': 3.6417467456065013e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.939, 'grad_norm': 7.7394514083862305, 'learning_rate': 3.6364849622792262e-06} [Rank 0] Trainer log: {'loss': 0.939, 'grad_norm': 7.7394514083862305, 'learning_rate': 3.6364849622792262e-06}[Rank 1] Trainer log: {'loss': 0.939, 'grad_norm': 7.7394514083862305, 'learning_rate': 3.6364849622792262e-06} [Rank 3] Trainer log: {'loss': 0.939, 'grad_norm': 7.7394514083862305, 'learning_rate': 3.6364849622792262e-06} {'loss': 0.939, 'grad_norm': 7.7394514083862305, 'learning_rate': 3.6364849622792262e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7291, 'grad_norm': 4.3962836265563965, 'learning_rate': 3.631226138056976e-06} [Rank 3] Trainer log: {'loss': 0.7291, 'grad_norm': 4.3962836265563965, 'learning_rate': 3.631226138056976e-06} [Rank 1] Trainer log: {'loss': 0.7291, 'grad_norm': 4.3962836265563965, 'learning_rate': 3.631226138056976e-06} [Rank 0] Trainer log: {'loss': 0.7291, 'grad_norm': 4.3962836265563965, 'learning_rate': 3.631226138056976e-06} {'loss': 0.7291, 'grad_norm': 4.3962836265563965, 'learning_rate': 3.631226138056976e-06, 'epoch': 0.73} [Rank 1] Trainer log: {'loss': 0.803, 'grad_norm': 3.428884744644165, 'learning_rate': 3.625970275385157e-06}[Rank 2] Trainer log: {'loss': 0.803, 'grad_norm': 3.428884744644165, 'learning_rate': 3.625970275385157e-06}[Rank 0] Trainer log: {'loss': 0.803, 'grad_norm': 3.428884744644165, 'learning_rate': 3.625970275385157e-06} [Rank 3] Trainer log: {'loss': 0.803, 'grad_norm': 3.428884744644165, 'learning_rate': 3.625970275385157e-06} {'loss': 0.803, 'grad_norm': 3.428884744644165, 'learning_rate': 3.625970275385157e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.8129, 'grad_norm': 5.628931522369385, 'learning_rate': 3.6207173767077996e-06}[Rank 0] Trainer log: {'loss': 0.8129, 'grad_norm': 5.628931522369385, 'learning_rate': 3.6207173767077996e-06} [Rank 3] Trainer log: {'loss': 0.8129, 'grad_norm': 5.628931522369385, 'learning_rate': 3.6207173767077996e-06} [Rank 1] Trainer log: {'loss': 0.8129, 'grad_norm': 5.628931522369385, 'learning_rate': 3.6207173767077996e-06} {'loss': 0.8129, 'grad_norm': 5.628931522369385, 'learning_rate': 3.6207173767077996e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 1.0201, 'grad_norm': 4.106201648712158, 'learning_rate': 3.6154674444675685e-06} [Rank 3] Trainer log: {'loss': 1.0201, 'grad_norm': 4.106201648712158, 'learning_rate': 3.6154674444675685e-06}[Rank 0] Trainer log: {'loss': 1.0201, 'grad_norm': 4.106201648712158, 'learning_rate': 3.6154674444675685e-06} [Rank 1] Trainer log: {'loss': 1.0201, 'grad_norm': 4.106201648712158, 'learning_rate': 3.6154674444675685e-06} {'loss': 1.0201, 'grad_norm': 4.106201648712158, 'learning_rate': 3.6154674444675685e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.78, 'grad_norm': 2.2719357013702393, 'learning_rate': 3.6102204811057375e-06}[Rank 2] Trainer log: {'loss': 0.78, 'grad_norm': 2.2719357013702393, 'learning_rate': 3.6102204811057375e-06} [Rank 0] Trainer log: {'loss': 0.78, 'grad_norm': 2.2719357013702393, 'learning_rate': 3.6102204811057375e-06}[Rank 1] Trainer log: {'loss': 0.78, 'grad_norm': 2.2719357013702393, 'learning_rate': 3.6102204811057375e-06} {'loss': 0.78, 'grad_norm': 2.2719357013702393, 'learning_rate': 3.6102204811057375e-06, 'epoch': 0.73} [Rank 3] Trainer log: {'loss': 0.7438, 'grad_norm': 4.827491283416748, 'learning_rate': 3.6049764890621976e-06}[Rank 0] Trainer log: {'loss': 0.7438, 'grad_norm': 4.827491283416748, 'learning_rate': 3.6049764890621976e-06}[Rank 2] Trainer log: {'loss': 0.7438, 'grad_norm': 4.827491283416748, 'learning_rate': 3.6049764890621976e-06} [Rank 1] Trainer log: {'loss': 0.7438, 'grad_norm': 4.827491283416748, 'learning_rate': 3.6049764890621976e-06} {'loss': 0.7438, 'grad_norm': 4.827491283416748, 'learning_rate': 3.6049764890621976e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 0.7111, 'grad_norm': 3.2240262031555176, 'learning_rate': 3.5997354707754673e-06}[Rank 1] Trainer log: {'loss': 0.7111, 'grad_norm': 3.2240262031555176, 'learning_rate': 3.5997354707754673e-06}[Rank 0] Trainer log: {'loss': 0.7111, 'grad_norm': 3.2240262031555176, 'learning_rate': 3.5997354707754673e-06} [Rank 3] Trainer log: {'loss': 0.7111, 'grad_norm': 3.2240262031555176, 'learning_rate': 3.5997354707754673e-06} {'loss': 0.7111, 'grad_norm': 3.2240262031555176, 'learning_rate': 3.5997354707754673e-06, 'epoch': 0.73} [Rank 2] Trainer log: {'loss': 1.0249, 'grad_norm': 1.9628269672393799, 'learning_rate': 3.5944974286826815e-06} [Rank 0] Trainer log: {'loss': 1.0249, 'grad_norm': 1.9628269672393799, 'learning_rate': 3.5944974286826815e-06}[Rank 3] Trainer log: {'loss': 1.0249, 'grad_norm': 1.9628269672393799, 'learning_rate': 3.5944974286826815e-06} [Rank 1] Trainer log: {'loss': 1.0249, 'grad_norm': 1.9628269672393799, 'learning_rate': 3.5944974286826815e-06} {'loss': 1.0249, 'grad_norm': 1.9628269672393799, 'learning_rate': 3.5944974286826815e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7778, 'grad_norm': 3.581693649291992, 'learning_rate': 3.5892623652195813e-06} [Rank 3] Trainer log: {'loss': 0.7778, 'grad_norm': 3.581693649291992, 'learning_rate': 3.5892623652195813e-06}[Rank 0] Trainer log: {'loss': 0.7778, 'grad_norm': 3.581693649291992, 'learning_rate': 3.5892623652195813e-06} [Rank 1] Trainer log: {'loss': 0.7778, 'grad_norm': 3.581693649291992, 'learning_rate': 3.5892623652195813e-06} {'loss': 0.7778, 'grad_norm': 3.581693649291992, 'learning_rate': 3.5892623652195813e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.8131, 'grad_norm': 16.150217056274414, 'learning_rate': 3.5840302828205355e-06}[Rank 1] Trainer log: {'loss': 0.8131, 'grad_norm': 16.150217056274414, 'learning_rate': 3.5840302828205355e-06} [Rank 2] Trainer log: {'loss': 0.8131, 'grad_norm': 16.150217056274414, 'learning_rate': 3.5840302828205355e-06} [Rank 0] Trainer log: {'loss': 0.8131, 'grad_norm': 16.150217056274414, 'learning_rate': 3.5840302828205355e-06} {'loss': 0.8131, 'grad_norm': 16.150217056274414, 'learning_rate': 3.5840302828205355e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.9248, 'grad_norm': 3.9549012184143066, 'learning_rate': 3.5788011839185167e-06}[Rank 1] Trainer log: {'loss': 0.9248, 'grad_norm': 3.9549012184143066, 'learning_rate': 3.5788011839185167e-06} [Rank 0] Trainer log: {'loss': 0.9248, 'grad_norm': 3.9549012184143066, 'learning_rate': 3.5788011839185167e-06}[Rank 2] Trainer log: {'loss': 0.9248, 'grad_norm': 3.9549012184143066, 'learning_rate': 3.5788011839185167e-06} {'loss': 0.9248, 'grad_norm': 3.9549012184143066, 'learning_rate': 3.5788011839185167e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.9066, 'grad_norm': 7.331901550292969, 'learning_rate': 3.573575070945112e-06}[Rank 1] Trainer log: {'loss': 0.9066, 'grad_norm': 7.331901550292969, 'learning_rate': 3.573575070945112e-06} [Rank 0] Trainer log: {'loss': 0.9066, 'grad_norm': 7.331901550292969, 'learning_rate': 3.573575070945112e-06} [Rank 3] Trainer log: {'loss': 0.9066, 'grad_norm': 7.331901550292969, 'learning_rate': 3.573575070945112e-06} {'loss': 0.9066, 'grad_norm': 7.331901550292969, 'learning_rate': 3.573575070945112e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8882, 'grad_norm': 2.9893574714660645, 'learning_rate': 3.568351946330527e-06}[Rank 3] Trainer log: {'loss': 0.8882, 'grad_norm': 2.9893574714660645, 'learning_rate': 3.568351946330527e-06} [Rank 0] Trainer log: {'loss': 0.8882, 'grad_norm': 2.9893574714660645, 'learning_rate': 3.568351946330527e-06} [Rank 1] Trainer log: {'loss': 0.8882, 'grad_norm': 2.9893574714660645, 'learning_rate': 3.568351946330527e-06} {'loss': 0.8882, 'grad_norm': 2.9893574714660645, 'learning_rate': 3.568351946330527e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.6923, 'grad_norm': 6.032876491546631, 'learning_rate': 3.563131812503571e-06}[Rank 1] Trainer log: {'loss': 0.6923, 'grad_norm': 6.032876491546631, 'learning_rate': 3.563131812503571e-06} [Rank 3] Trainer log: {'loss': 0.6923, 'grad_norm': 6.032876491546631, 'learning_rate': 3.563131812503571e-06} [Rank 0] Trainer log: {'loss': 0.6923, 'grad_norm': 6.032876491546631, 'learning_rate': 3.563131812503571e-06} {'loss': 0.6923, 'grad_norm': 6.032876491546631, 'learning_rate': 3.563131812503571e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8334, 'grad_norm': 7.007505893707275, 'learning_rate': 3.557914671891659e-06} [Rank 0] Trainer log: {'loss': 0.8334, 'grad_norm': 7.007505893707275, 'learning_rate': 3.557914671891659e-06}[Rank 3] Trainer log: {'loss': 0.8334, 'grad_norm': 7.007505893707275, 'learning_rate': 3.557914671891659e-06} [Rank 1] Trainer log: {'loss': 0.8334, 'grad_norm': 7.007505893707275, 'learning_rate': 3.557914671891659e-06} {'loss': 0.8334, 'grad_norm': 7.007505893707275, 'learning_rate': 3.557914671891659e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7755, 'grad_norm': 4.657846450805664, 'learning_rate': 3.5527005269208225e-06}[Rank 1] Trainer log: {'loss': 0.7755, 'grad_norm': 4.657846450805664, 'learning_rate': 3.5527005269208225e-06} [Rank 3] Trainer log: {'loss': 0.7755, 'grad_norm': 4.657846450805664, 'learning_rate': 3.5527005269208225e-06} [Rank 0] Trainer log: {'loss': 0.7755, 'grad_norm': 4.657846450805664, 'learning_rate': 3.5527005269208225e-06} {'loss': 0.7755, 'grad_norm': 4.657846450805664, 'learning_rate': 3.5527005269208225e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7808, 'grad_norm': 2.5688247680664062, 'learning_rate': 3.5474893800157005e-06}[Rank 3] Trainer log: {'loss': 0.7808, 'grad_norm': 2.5688247680664062, 'learning_rate': 3.5474893800157005e-06}[Rank 1] Trainer log: {'loss': 0.7808, 'grad_norm': 2.5688247680664062, 'learning_rate': 3.5474893800157005e-06} [Rank 0] Trainer log: {'loss': 0.7808, 'grad_norm': 2.5688247680664062, 'learning_rate': 3.5474893800157005e-06} {'loss': 0.7808, 'grad_norm': 2.5688247680664062, 'learning_rate': 3.5474893800157005e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7817, 'grad_norm': 6.052860260009766, 'learning_rate': 3.5422812335995294e-06} [Rank 3] Trainer log: {'loss': 0.7817, 'grad_norm': 6.052860260009766, 'learning_rate': 3.5422812335995294e-06} [Rank 0] Trainer log: {'loss': 0.7817, 'grad_norm': 6.052860260009766, 'learning_rate': 3.5422812335995294e-06} [Rank 1] Trainer log: {'loss': 0.7817, 'grad_norm': 6.052860260009766, 'learning_rate': 3.5422812335995294e-06} {'loss': 0.7817, 'grad_norm': 6.052860260009766, 'learning_rate': 3.5422812335995294e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.8195, 'grad_norm': 2.681206464767456, 'learning_rate': 3.537076090094158e-06}[Rank 2] Trainer log: {'loss': 0.8195, 'grad_norm': 2.681206464767456, 'learning_rate': 3.537076090094158e-06}[Rank 1] Trainer log: {'loss': 0.8195, 'grad_norm': 2.681206464767456, 'learning_rate': 3.537076090094158e-06} [Rank 0] Trainer log: {'loss': 0.8195, 'grad_norm': 2.681206464767456, 'learning_rate': 3.537076090094158e-06} {'loss': 0.8195, 'grad_norm': 2.681206464767456, 'learning_rate': 3.537076090094158e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8807, 'grad_norm': 10.593487739562988, 'learning_rate': 3.531873951920035e-06} [Rank 1] Trainer log: {'loss': 0.8807, 'grad_norm': 10.593487739562988, 'learning_rate': 3.531873951920035e-06}[Rank 0] Trainer log: {'loss': 0.8807, 'grad_norm': 10.593487739562988, 'learning_rate': 3.531873951920035e-06} [Rank 3] Trainer log: {'loss': 0.8807, 'grad_norm': 10.593487739562988, 'learning_rate': 3.531873951920035e-06} {'loss': 0.8807, 'grad_norm': 10.593487739562988, 'learning_rate': 3.531873951920035e-06, 'epoch': 0.74} [Rank 0] Trainer log: {'loss': 0.5768, 'grad_norm': 2.5967488288879395, 'learning_rate': 3.5266748214962077e-06}[Rank 3] Trainer log: {'loss': 0.5768, 'grad_norm': 2.5967488288879395, 'learning_rate': 3.5266748214962077e-06} [Rank 1] Trainer log: {'loss': 0.5768, 'grad_norm': 2.5967488288879395, 'learning_rate': 3.5266748214962077e-06}[Rank 2] Trainer log: {'loss': 0.5768, 'grad_norm': 2.5967488288879395, 'learning_rate': 3.5266748214962077e-06} {'loss': 0.5768, 'grad_norm': 2.5967488288879395, 'learning_rate': 3.5266748214962077e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.9099, 'grad_norm': 3.677097797393799, 'learning_rate': 3.5214787012403373e-06} [Rank 1] Trainer log: {'loss': 0.9099, 'grad_norm': 3.677097797393799, 'learning_rate': 3.5214787012403373e-06}[Rank 3] Trainer log: {'loss': 0.9099, 'grad_norm': 3.677097797393799, 'learning_rate': 3.5214787012403373e-06} [Rank 0] Trainer log: {'loss': 0.9099, 'grad_norm': 3.677097797393799, 'learning_rate': 3.5214787012403373e-06} {'loss': 0.9099, 'grad_norm': 3.677097797393799, 'learning_rate': 3.5214787012403373e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7663, 'grad_norm': 6.067300796508789, 'learning_rate': 3.5162855935686723e-06}[Rank 0] Trainer log: {'loss': 0.7663, 'grad_norm': 6.067300796508789, 'learning_rate': 3.5162855935686723e-06}[Rank 3] Trainer log: {'loss': 0.7663, 'grad_norm': 6.067300796508789, 'learning_rate': 3.5162855935686723e-06} [Rank 1] Trainer log: {'loss': 0.7663, 'grad_norm': 6.067300796508789, 'learning_rate': 3.5162855935686723e-06} {'loss': 0.7663, 'grad_norm': 6.067300796508789, 'learning_rate': 3.5162855935686723e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.778, 'grad_norm': 3.217203378677368, 'learning_rate': 3.5110955008960634e-06}[Rank 2] Trainer log: {'loss': 0.778, 'grad_norm': 3.217203378677368, 'learning_rate': 3.5110955008960634e-06} [Rank 1] Trainer log: {'loss': 0.778, 'grad_norm': 3.217203378677368, 'learning_rate': 3.5110955008960634e-06} [Rank 0] Trainer log: {'loss': 0.778, 'grad_norm': 3.217203378677368, 'learning_rate': 3.5110955008960634e-06} {'loss': 0.778, 'grad_norm': 3.217203378677368, 'learning_rate': 3.5110955008960634e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.885, 'grad_norm': 4.13922119140625, 'learning_rate': 3.5059084256359643e-06} [Rank 2] Trainer log: {'loss': 0.885, 'grad_norm': 4.13922119140625, 'learning_rate': 3.5059084256359643e-06}[Rank 0] Trainer log: {'loss': 0.885, 'grad_norm': 4.13922119140625, 'learning_rate': 3.5059084256359643e-06} {'loss': 0.885, 'grad_norm': 4.13922119140625, 'learning_rate': 3.5059084256359643e-06, 'epoch': 0.74}[Rank 1] Trainer log: {'loss': 0.885, 'grad_norm': 4.13922119140625, 'learning_rate': 3.5059084256359643e-06} [Rank 2] Trainer log: {'loss': 0.8575, 'grad_norm': 9.786083221435547, 'learning_rate': 3.5007243702004257e-06}[Rank 3] Trainer log: {'loss': 0.8575, 'grad_norm': 9.786083221435547, 'learning_rate': 3.5007243702004257e-06}[Rank 0] Trainer log: {'loss': 0.8575, 'grad_norm': 9.786083221435547, 'learning_rate': 3.5007243702004257e-06} [Rank 1] Trainer log: {'loss': 0.8575, 'grad_norm': 9.786083221435547, 'learning_rate': 3.5007243702004257e-06} {'loss': 0.8575, 'grad_norm': 9.786083221435547, 'learning_rate': 3.5007243702004257e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.7825, 'grad_norm': 3.6364986896514893, 'learning_rate': 3.495543337000087e-06} [Rank 2] Trainer log: {'loss': 0.7825, 'grad_norm': 3.6364986896514893, 'learning_rate': 3.495543337000087e-06}[Rank 0] Trainer log: {'loss': 0.7825, 'grad_norm': 3.6364986896514893, 'learning_rate': 3.495543337000087e-06} [Rank 1] Trainer log: {'loss': 0.7825, 'grad_norm': 3.6364986896514893, 'learning_rate': 3.495543337000087e-06} {'loss': 0.7825, 'grad_norm': 3.6364986896514893, 'learning_rate': 3.495543337000087e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.9105, 'grad_norm': 2.3669135570526123, 'learning_rate': 3.490365328444183e-06}[Rank 3] Trainer log: {'loss': 0.9105, 'grad_norm': 2.3669135570526123, 'learning_rate': 3.490365328444183e-06} [Rank 0] Trainer log: {'loss': 0.9105, 'grad_norm': 2.3669135570526123, 'learning_rate': 3.490365328444183e-06} [Rank 1] Trainer log: {'loss': 0.9105, 'grad_norm': 2.3669135570526123, 'learning_rate': 3.490365328444183e-06} {'loss': 0.9105, 'grad_norm': 2.3669135570526123, 'learning_rate': 3.490365328444183e-06, 'epoch': 0.74} [Rank 0] Trainer log: {'loss': 0.782, 'grad_norm': 3.664670705795288, 'learning_rate': 3.485190346940552e-06}[Rank 3] Trainer log: {'loss': 0.782, 'grad_norm': 3.664670705795288, 'learning_rate': 3.485190346940552e-06} [Rank 1] Trainer log: {'loss': 0.782, 'grad_norm': 3.664670705795288, 'learning_rate': 3.485190346940552e-06} [Rank 2] Trainer log: {'loss': 0.782, 'grad_norm': 3.664670705795288, 'learning_rate': 3.485190346940552e-06} {'loss': 0.782, 'grad_norm': 3.664670705795288, 'learning_rate': 3.485190346940552e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7522, 'grad_norm': 4.603105545043945, 'learning_rate': 3.4800183948956147e-06}[Rank 0] Trainer log: {'loss': 0.7522, 'grad_norm': 4.603105545043945, 'learning_rate': 3.4800183948956147e-06}[Rank 3] Trainer log: {'loss': 0.7522, 'grad_norm': 4.603105545043945, 'learning_rate': 3.4800183948956147e-06} [Rank 1] Trainer log: {'loss': 0.7522, 'grad_norm': 4.603105545043945, 'learning_rate': 3.4800183948956147e-06} {'loss': 0.7522, 'grad_norm': 4.603105545043945, 'learning_rate': 3.4800183948956147e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 1.0095, 'grad_norm': 4.932697296142578, 'learning_rate': 3.474849474714381e-06}[Rank 2] Trainer log: {'loss': 1.0095, 'grad_norm': 4.932697296142578, 'learning_rate': 3.474849474714381e-06}[Rank 0] Trainer log: {'loss': 1.0095, 'grad_norm': 4.932697296142578, 'learning_rate': 3.474849474714381e-06} [Rank 1] Trainer log: {'loss': 1.0095, 'grad_norm': 4.932697296142578, 'learning_rate': 3.474849474714381e-06} {'loss': 1.0095, 'grad_norm': 4.932697296142578, 'learning_rate': 3.474849474714381e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 1.158, 'grad_norm': 3.007347822189331, 'learning_rate': 3.469683588800462e-06}[Rank 3] Trainer log: {'loss': 1.158, 'grad_norm': 3.007347822189331, 'learning_rate': 3.469683588800462e-06} [Rank 1] Trainer log: {'loss': 1.158, 'grad_norm': 3.007347822189331, 'learning_rate': 3.469683588800462e-06} [Rank 0] Trainer log: {'loss': 1.158, 'grad_norm': 3.007347822189331, 'learning_rate': 3.469683588800462e-06} {'loss': 1.158, 'grad_norm': 3.007347822189331, 'learning_rate': 3.469683588800462e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.7157, 'grad_norm': 3.8820855617523193, 'learning_rate': 3.464520739556053e-06}[Rank 2] Trainer log: {'loss': 0.7157, 'grad_norm': 3.8820855617523193, 'learning_rate': 3.464520739556053e-06}[Rank 0] Trainer log: {'loss': 0.7157, 'grad_norm': 3.8820855617523193, 'learning_rate': 3.464520739556053e-06} [Rank 1] Trainer log: {'loss': 0.7157, 'grad_norm': 3.8820855617523193, 'learning_rate': 3.464520739556053e-06} {'loss': 0.7157, 'grad_norm': 3.8820855617523193, 'learning_rate': 3.464520739556053e-06, 'epoch': 0.74} [Rank 0] Trainer log: {'loss': 0.9804, 'grad_norm': 4.377410411834717, 'learning_rate': 3.459360929381931e-06}[Rank 3] Trainer log: {'loss': 0.9804, 'grad_norm': 4.377410411834717, 'learning_rate': 3.459360929381931e-06}[Rank 2] Trainer log: {'loss': 0.9804, 'grad_norm': 4.377410411834717, 'learning_rate': 3.459360929381931e-06} [Rank 1] Trainer log: {'loss': 0.9804, 'grad_norm': 4.377410411834717, 'learning_rate': 3.459360929381931e-06} {'loss': 0.9804, 'grad_norm': 4.377410411834717, 'learning_rate': 3.459360929381931e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.8122, 'grad_norm': 9.639141082763672, 'learning_rate': 3.454204160677471e-06} [Rank 0] Trainer log: {'loss': 0.8122, 'grad_norm': 9.639141082763672, 'learning_rate': 3.454204160677471e-06}[Rank 2] Trainer log: {'loss': 0.8122, 'grad_norm': 9.639141082763672, 'learning_rate': 3.454204160677471e-06} [Rank 1] Trainer log: {'loss': 0.8122, 'grad_norm': 9.639141082763672, 'learning_rate': 3.454204160677471e-06} {'loss': 0.8122, 'grad_norm': 9.639141082763672, 'learning_rate': 3.454204160677471e-06, 'epoch': 0.74} [Rank 0] Trainer log: {'loss': 0.9177, 'grad_norm': 3.7032346725463867, 'learning_rate': 3.449050435840624e-06}[Rank 3] Trainer log: {'loss': 0.9177, 'grad_norm': 3.7032346725463867, 'learning_rate': 3.449050435840624e-06}[Rank 2] Trainer log: {'loss': 0.9177, 'grad_norm': 3.7032346725463867, 'learning_rate': 3.449050435840624e-06} [Rank 1] Trainer log: {'loss': 0.9177, 'grad_norm': 3.7032346725463867, 'learning_rate': 3.449050435840624e-06} {'loss': 0.9177, 'grad_norm': 3.7032346725463867, 'learning_rate': 3.449050435840624e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.5521, 'grad_norm': 5.358617782592773, 'learning_rate': 3.4438997572679267e-06} [Rank 3] Trainer log: {'loss': 0.5521, 'grad_norm': 5.358617782592773, 'learning_rate': 3.4438997572679267e-06} [Rank 1] Trainer log: {'loss': 0.5521, 'grad_norm': 5.358617782592773, 'learning_rate': 3.4438997572679267e-06} [Rank 0] Trainer log: {'loss': 0.5521, 'grad_norm': 5.358617782592773, 'learning_rate': 3.4438997572679267e-06} {'loss': 0.5521, 'grad_norm': 5.358617782592773, 'learning_rate': 3.4438997572679267e-06, 'epoch': 0.74} [Rank 1] Trainer log: {'loss': 0.7958, 'grad_norm': 5.485546588897705, 'learning_rate': 3.4387521273545075e-06}[Rank 2] Trainer log: {'loss': 0.7958, 'grad_norm': 5.485546588897705, 'learning_rate': 3.4387521273545075e-06} [Rank 3] Trainer log: {'loss': 0.7958, 'grad_norm': 5.485546588897705, 'learning_rate': 3.4387521273545075e-06} [Rank 0] Trainer log: {'loss': 0.7958, 'grad_norm': 5.485546588897705, 'learning_rate': 3.4387521273545075e-06} {'loss': 0.7958, 'grad_norm': 5.485546588897705, 'learning_rate': 3.4387521273545075e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8229, 'grad_norm': 4.639723300933838, 'learning_rate': 3.433607548494068e-06} [Rank 3] Trainer log: {'loss': 0.8229, 'grad_norm': 4.639723300933838, 'learning_rate': 3.433607548494068e-06}[Rank 0] Trainer log: {'loss': 0.8229, 'grad_norm': 4.639723300933838, 'learning_rate': 3.433607548494068e-06} [Rank 1] Trainer log: {'loss': 0.8229, 'grad_norm': 4.639723300933838, 'learning_rate': 3.433607548494068e-06} {'loss': 0.8229, 'grad_norm': 4.639723300933838, 'learning_rate': 3.433607548494068e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.7341, 'grad_norm': 2.601806640625, 'learning_rate': 3.4284660230788926e-06} [Rank 1] Trainer log: {'loss': 0.7341, 'grad_norm': 2.601806640625, 'learning_rate': 3.4284660230788926e-06} [Rank 0] Trainer log: {'loss': 0.7341, 'grad_norm': 2.601806640625, 'learning_rate': 3.4284660230788926e-06}[Rank 3] Trainer log: {'loss': 0.7341, 'grad_norm': 2.601806640625, 'learning_rate': 3.4284660230788926e-06} {'loss': 0.7341, 'grad_norm': 2.601806640625, 'learning_rate': 3.4284660230788926e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.7554, 'grad_norm': 2.507112979888916, 'learning_rate': 3.423327553499848e-06}[Rank 2] Trainer log: {'loss': 0.7554, 'grad_norm': 2.507112979888916, 'learning_rate': 3.423327553499848e-06} [Rank 1] Trainer log: {'loss': 0.7554, 'grad_norm': 2.507112979888916, 'learning_rate': 3.423327553499848e-06} [Rank 0] Trainer log: {'loss': 0.7554, 'grad_norm': 2.507112979888916, 'learning_rate': 3.423327553499848e-06} {'loss': 0.7554, 'grad_norm': 2.507112979888916, 'learning_rate': 3.423327553499848e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.9216, 'grad_norm': 3.5247037410736084, 'learning_rate': 3.4181921421463857e-06} [Rank 3] Trainer log: {'loss': 0.9216, 'grad_norm': 3.5247037410736084, 'learning_rate': 3.4181921421463857e-06} [Rank 0] Trainer log: {'loss': 0.9216, 'grad_norm': 3.5247037410736084, 'learning_rate': 3.4181921421463857e-06}[Rank 1] Trainer log: {'loss': 0.9216, 'grad_norm': 3.5247037410736084, 'learning_rate': 3.4181921421463857e-06} {'loss': 0.9216, 'grad_norm': 3.5247037410736084, 'learning_rate': 3.4181921421463857e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.6843, 'grad_norm': 4.367401123046875, 'learning_rate': 3.413059791406521e-06}[Rank 1] Trainer log: {'loss': 0.6843, 'grad_norm': 4.367401123046875, 'learning_rate': 3.413059791406521e-06}[Rank 0] Trainer log: {'loss': 0.6843, 'grad_norm': 4.367401123046875, 'learning_rate': 3.413059791406521e-06} [Rank 3] Trainer log: {'loss': 0.6843, 'grad_norm': 4.367401123046875, 'learning_rate': 3.413059791406521e-06} {'loss': 0.6843, 'grad_norm': 4.367401123046875, 'learning_rate': 3.413059791406521e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.8392, 'grad_norm': 5.662171363830566, 'learning_rate': 3.4079305036668595e-06} [Rank 2] Trainer log: {'loss': 0.8392, 'grad_norm': 5.662171363830566, 'learning_rate': 3.4079305036668595e-06} [Rank 1] Trainer log: {'loss': 0.8392, 'grad_norm': 5.662171363830566, 'learning_rate': 3.4079305036668595e-06} [Rank 0] Trainer log: {'loss': 0.8392, 'grad_norm': 5.662171363830566, 'learning_rate': 3.4079305036668595e-06} {'loss': 0.8392, 'grad_norm': 5.662171363830566, 'learning_rate': 3.4079305036668595e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.6437, 'grad_norm': 7.358058929443359, 'learning_rate': 3.402804281312576e-06}[Rank 3] Trainer log: {'loss': 0.6437, 'grad_norm': 7.358058929443359, 'learning_rate': 3.402804281312576e-06} [Rank 0] Trainer log: {'loss': 0.6437, 'grad_norm': 7.358058929443359, 'learning_rate': 3.402804281312576e-06} [Rank 1] Trainer log: {'loss': 0.6437, 'grad_norm': 7.358058929443359, 'learning_rate': 3.402804281312576e-06} {'loss': 0.6437, 'grad_norm': 7.358058929443359, 'learning_rate': 3.402804281312576e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.913, 'grad_norm': 4.691687107086182, 'learning_rate': 3.397681126727416e-06} [Rank 1] Trainer log: {'loss': 0.913, 'grad_norm': 4.691687107086182, 'learning_rate': 3.397681126727416e-06} [Rank 0] Trainer log: {'loss': 0.913, 'grad_norm': 4.691687107086182, 'learning_rate': 3.397681126727416e-06}[Rank 3] Trainer log: {'loss': 0.913, 'grad_norm': 4.691687107086182, 'learning_rate': 3.397681126727416e-06} {'loss': 0.913, 'grad_norm': 4.691687107086182, 'learning_rate': 3.397681126727416e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8747, 'grad_norm': 12.653194427490234, 'learning_rate': 3.3925610422937094e-06}[Rank 3] Trainer log: {'loss': 0.8747, 'grad_norm': 12.653194427490234, 'learning_rate': 3.3925610422937094e-06}[Rank 0] Trainer log: {'loss': 0.8747, 'grad_norm': 12.653194427490234, 'learning_rate': 3.3925610422937094e-06} [Rank 1] Trainer log: {'loss': 0.8747, 'grad_norm': 12.653194427490234, 'learning_rate': 3.3925610422937094e-06} {'loss': 0.8747, 'grad_norm': 12.653194427490234, 'learning_rate': 3.3925610422937094e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8239, 'grad_norm': 12.991448402404785, 'learning_rate': 3.3874440303923493e-06}[Rank 0] Trainer log: {'loss': 0.8239, 'grad_norm': 12.991448402404785, 'learning_rate': 3.3874440303923493e-06}[Rank 3] Trainer log: {'loss': 0.8239, 'grad_norm': 12.991448402404785, 'learning_rate': 3.3874440303923493e-06} [Rank 1] Trainer log: {'loss': 0.8239, 'grad_norm': 12.991448402404785, 'learning_rate': 3.3874440303923493e-06} {'loss': 0.8239, 'grad_norm': 12.991448402404785, 'learning_rate': 3.3874440303923493e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8329, 'grad_norm': 5.421877384185791, 'learning_rate': 3.3823300934028e-06}[Rank 3] Trainer log: {'loss': 0.8329, 'grad_norm': 5.421877384185791, 'learning_rate': 3.3823300934028e-06} [Rank 1] Trainer log: {'loss': 0.8329, 'grad_norm': 5.421877384185791, 'learning_rate': 3.3823300934028e-06} [Rank 0] Trainer log: {'loss': 0.8329, 'grad_norm': 5.421877384185791, 'learning_rate': 3.3823300934028e-06} {'loss': 0.8329, 'grad_norm': 5.421877384185791, 'learning_rate': 3.3823300934028e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.704, 'grad_norm': 4.2170820236206055, 'learning_rate': 3.377219233703102e-06} [Rank 0] Trainer log: {'loss': 0.704, 'grad_norm': 4.2170820236206055, 'learning_rate': 3.377219233703102e-06}[Rank 1] Trainer log: {'loss': 0.704, 'grad_norm': 4.2170820236206055, 'learning_rate': 3.377219233703102e-06} [Rank 3] Trainer log: {'loss': 0.704, 'grad_norm': 4.2170820236206055, 'learning_rate': 3.377219233703102e-06} {'loss': 0.704, 'grad_norm': 4.2170820236206055, 'learning_rate': 3.377219233703102e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8625, 'grad_norm': 4.797454833984375, 'learning_rate': 3.372111453669864e-06}[Rank 0] Trainer log: {'loss': 0.8625, 'grad_norm': 4.797454833984375, 'learning_rate': 3.372111453669864e-06} [Rank 3] Trainer log: {'loss': 0.8625, 'grad_norm': 4.797454833984375, 'learning_rate': 3.372111453669864e-06} [Rank 1] Trainer log: {'loss': 0.8625, 'grad_norm': 4.797454833984375, 'learning_rate': 3.372111453669864e-06} {'loss': 0.8625, 'grad_norm': 4.797454833984375, 'learning_rate': 3.372111453669864e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.6571, 'grad_norm': 4.373620510101318, 'learning_rate': 3.367006755678258e-06} [Rank 1] Trainer log: {'loss': 0.6571, 'grad_norm': 4.373620510101318, 'learning_rate': 3.367006755678258e-06} [Rank 3] Trainer log: {'loss': 0.6571, 'grad_norm': 4.373620510101318, 'learning_rate': 3.367006755678258e-06} [Rank 0] Trainer log: {'loss': 0.6571, 'grad_norm': 4.373620510101318, 'learning_rate': 3.367006755678258e-06} {'loss': 0.6571, 'grad_norm': 4.373620510101318, 'learning_rate': 3.367006755678258e-06, 'epoch': 0.74} [Rank 1] Trainer log: {'loss': 0.9099, 'grad_norm': 3.870811939239502, 'learning_rate': 3.3619051421020223e-06}[Rank 3] Trainer log: {'loss': 0.9099, 'grad_norm': 3.870811939239502, 'learning_rate': 3.3619051421020223e-06}[Rank 2] Trainer log: {'loss': 0.9099, 'grad_norm': 3.870811939239502, 'learning_rate': 3.3619051421020223e-06} [Rank 0] Trainer log: {'loss': 0.9099, 'grad_norm': 3.870811939239502, 'learning_rate': 3.3619051421020223e-06} {'loss': 0.9099, 'grad_norm': 3.870811939239502, 'learning_rate': 3.3619051421020223e-06, 'epoch': 0.74} [Rank 3] Trainer log: {'loss': 0.835, 'grad_norm': 4.470117092132568, 'learning_rate': 3.3568066153134705e-06}[Rank 2] Trainer log: {'loss': 0.835, 'grad_norm': 4.470117092132568, 'learning_rate': 3.3568066153134705e-06} [Rank 0] Trainer log: {'loss': 0.835, 'grad_norm': 4.470117092132568, 'learning_rate': 3.3568066153134705e-06}[Rank 1] Trainer log: {'loss': 0.835, 'grad_norm': 4.470117092132568, 'learning_rate': 3.3568066153134705e-06} {'loss': 0.835, 'grad_norm': 4.470117092132568, 'learning_rate': 3.3568066153134705e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8749, 'grad_norm': 4.917026996612549, 'learning_rate': 3.3517111776834686e-06} [Rank 0] Trainer log: {'loss': 0.8749, 'grad_norm': 4.917026996612549, 'learning_rate': 3.3517111776834686e-06}[Rank 3] Trainer log: {'loss': 0.8749, 'grad_norm': 4.917026996612549, 'learning_rate': 3.3517111776834686e-06} [Rank 1] Trainer log: {'loss': 0.8749, 'grad_norm': 4.917026996612549, 'learning_rate': 3.3517111776834686e-06} {'loss': 0.8749, 'grad_norm': 4.917026996612549, 'learning_rate': 3.3517111776834686e-06, 'epoch': 0.74} [Rank 2] Trainer log: {'loss': 0.8594, 'grad_norm': 7.2642388343811035, 'learning_rate': 3.346618831581451e-06} [Rank 0] Trainer log: {'loss': 0.8594, 'grad_norm': 7.2642388343811035, 'learning_rate': 3.346618831581451e-06}[Rank 3] Trainer log: {'loss': 0.8594, 'grad_norm': 7.2642388343811035, 'learning_rate': 3.346618831581451e-06} [Rank 1] Trainer log: {'loss': 0.8594, 'grad_norm': 7.2642388343811035, 'learning_rate': 3.346618831581451e-06} {'loss': 0.8594, 'grad_norm': 7.2642388343811035, 'learning_rate': 3.346618831581451e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 1.0322, 'grad_norm': 4.61954402923584, 'learning_rate': 3.341529579375419e-06} [Rank 2] Trainer log: {'loss': 1.0322, 'grad_norm': 4.61954402923584, 'learning_rate': 3.341529579375419e-06} [Rank 0] Trainer log: {'loss': 1.0322, 'grad_norm': 4.61954402923584, 'learning_rate': 3.341529579375419e-06}[Rank 1] Trainer log: {'loss': 1.0322, 'grad_norm': 4.61954402923584, 'learning_rate': 3.341529579375419e-06} {'loss': 1.0322, 'grad_norm': 4.61954402923584, 'learning_rate': 3.341529579375419e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.8525, 'grad_norm': 11.21281623840332, 'learning_rate': 3.3364434234319267e-06}[Rank 3] Trainer log: {'loss': 0.8525, 'grad_norm': 11.21281623840332, 'learning_rate': 3.3364434234319267e-06}[Rank 2] Trainer log: {'loss': 0.8525, 'grad_norm': 11.21281623840332, 'learning_rate': 3.3364434234319267e-06} [Rank 0] Trainer log: {'loss': 0.8525, 'grad_norm': 11.21281623840332, 'learning_rate': 3.3364434234319267e-06} {'loss': 0.8525, 'grad_norm': 11.21281623840332, 'learning_rate': 3.3364434234319267e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.797, 'grad_norm': 8.129609107971191, 'learning_rate': 3.331360366116095e-06}[Rank 2] Trainer log: {'loss': 0.797, 'grad_norm': 8.129609107971191, 'learning_rate': 3.331360366116095e-06} [Rank 3] Trainer log: {'loss': 0.797, 'grad_norm': 8.129609107971191, 'learning_rate': 3.331360366116095e-06} [Rank 0] Trainer log: {'loss': 0.797, 'grad_norm': 8.129609107971191, 'learning_rate': 3.331360366116095e-06} {'loss': 0.797, 'grad_norm': 8.129609107971191, 'learning_rate': 3.331360366116095e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.9035, 'grad_norm': 2.723996162414551, 'learning_rate': 3.326280409791608e-06} [Rank 3] Trainer log: {'loss': 0.9035, 'grad_norm': 2.723996162414551, 'learning_rate': 3.326280409791608e-06}[Rank 2] Trainer log: {'loss': 0.9035, 'grad_norm': 2.723996162414551, 'learning_rate': 3.326280409791608e-06} [Rank 0] Trainer log: {'loss': 0.9035, 'grad_norm': 2.723996162414551, 'learning_rate': 3.326280409791608e-06} {'loss': 0.9035, 'grad_norm': 2.723996162414551, 'learning_rate': 3.326280409791608e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.6185, 'grad_norm': 6.878986835479736, 'learning_rate': 3.3212035568206947e-06}[Rank 0] Trainer log: {'loss': 0.6185, 'grad_norm': 6.878986835479736, 'learning_rate': 3.3212035568206947e-06}[Rank 3] Trainer log: {'loss': 0.6185, 'grad_norm': 6.878986835479736, 'learning_rate': 3.3212035568206947e-06} [Rank 1] Trainer log: {'loss': 0.6185, 'grad_norm': 6.878986835479736, 'learning_rate': 3.3212035568206947e-06} {'loss': 0.6185, 'grad_norm': 6.878986835479736, 'learning_rate': 3.3212035568206947e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.8681, 'grad_norm': 4.2721171379089355, 'learning_rate': 3.316129809564146e-06}[Rank 3] Trainer log: {'loss': 0.8681, 'grad_norm': 4.2721171379089355, 'learning_rate': 3.316129809564146e-06} [Rank 2] Trainer log: {'loss': 0.8681, 'grad_norm': 4.2721171379089355, 'learning_rate': 3.316129809564146e-06} [Rank 1] Trainer log: {'loss': 0.8681, 'grad_norm': 4.2721171379089355, 'learning_rate': 3.316129809564146e-06} {'loss': 0.8681, 'grad_norm': 4.2721171379089355, 'learning_rate': 3.316129809564146e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.8897, 'grad_norm': 7.376791954040527, 'learning_rate': 3.3110591703813176e-06} [Rank 3] Trainer log: {'loss': 0.8897, 'grad_norm': 7.376791954040527, 'learning_rate': 3.3110591703813176e-06} [Rank 0] Trainer log: {'loss': 0.8897, 'grad_norm': 7.376791954040527, 'learning_rate': 3.3110591703813176e-06}[Rank 1] Trainer log: {'loss': 0.8897, 'grad_norm': 7.376791954040527, 'learning_rate': 3.3110591703813176e-06} {'loss': 0.8897, 'grad_norm': 7.376791954040527, 'learning_rate': 3.3110591703813176e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.9319, 'grad_norm': 3.4933462142944336, 'learning_rate': 3.3059916416301098e-06} [Rank 0] Trainer log: {'loss': 0.9319, 'grad_norm': 3.4933462142944336, 'learning_rate': 3.3059916416301098e-06}[Rank 3] Trainer log: {'loss': 0.9319, 'grad_norm': 3.4933462142944336, 'learning_rate': 3.3059916416301098e-06} [Rank 1] Trainer log: {'loss': 0.9319, 'grad_norm': 3.4933462142944336, 'learning_rate': 3.3059916416301098e-06} {'loss': 0.9319, 'grad_norm': 3.4933462142944336, 'learning_rate': 3.3059916416301098e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.8869, 'grad_norm': 4.713348388671875, 'learning_rate': 3.3009272256669753e-06}[Rank 3] Trainer log: {'loss': 0.8869, 'grad_norm': 4.713348388671875, 'learning_rate': 3.3009272256669753e-06} [Rank 0] Trainer log: {'loss': 0.8869, 'grad_norm': 4.713348388671875, 'learning_rate': 3.3009272256669753e-06} [Rank 1] Trainer log: {'loss': 0.8869, 'grad_norm': 4.713348388671875, 'learning_rate': 3.3009272256669753e-06} {'loss': 0.8869, 'grad_norm': 4.713348388671875, 'learning_rate': 3.3009272256669753e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7512, 'grad_norm': 6.069649696350098, 'learning_rate': 3.2958659248469327e-06}[Rank 3] Trainer log: {'loss': 0.7512, 'grad_norm': 6.069649696350098, 'learning_rate': 3.2958659248469327e-06}[Rank 0] Trainer log: {'loss': 0.7512, 'grad_norm': 6.069649696350098, 'learning_rate': 3.2958659248469327e-06} [Rank 1] Trainer log: {'loss': 0.7512, 'grad_norm': 6.069649696350098, 'learning_rate': 3.2958659248469327e-06} {'loss': 0.7512, 'grad_norm': 6.069649696350098, 'learning_rate': 3.2958659248469327e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.8029, 'grad_norm': 9.295422554016113, 'learning_rate': 3.2908077415235328e-06}[Rank 2] Trainer log: {'loss': 0.8029, 'grad_norm': 9.295422554016113, 'learning_rate': 3.2908077415235328e-06} [Rank 1] Trainer log: {'loss': 0.8029, 'grad_norm': 9.295422554016113, 'learning_rate': 3.2908077415235328e-06} [Rank 0] Trainer log: {'loss': 0.8029, 'grad_norm': 9.295422554016113, 'learning_rate': 3.2908077415235328e-06} {'loss': 0.8029, 'grad_norm': 9.295422554016113, 'learning_rate': 3.2908077415235328e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.8211, 'grad_norm': 6.31588888168335, 'learning_rate': 3.2857526780488925e-06}[Rank 2] Trainer log: {'loss': 0.8211, 'grad_norm': 6.31588888168335, 'learning_rate': 3.2857526780488925e-06} [Rank 0] Trainer log: {'loss': 0.8211, 'grad_norm': 6.31588888168335, 'learning_rate': 3.2857526780488925e-06} [Rank 1] Trainer log: {'loss': 0.8211, 'grad_norm': 6.31588888168335, 'learning_rate': 3.2857526780488925e-06} {'loss': 0.8211, 'grad_norm': 6.31588888168335, 'learning_rate': 3.2857526780488925e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.8546, 'grad_norm': 3.7529945373535156, 'learning_rate': 3.2807007367736756e-06}[Rank 3] Trainer log: {'loss': 0.8546, 'grad_norm': 3.7529945373535156, 'learning_rate': 3.2807007367736756e-06}[Rank 0] Trainer log: {'loss': 0.8546, 'grad_norm': 3.7529945373535156, 'learning_rate': 3.2807007367736756e-06} [Rank 1] Trainer log: {'loss': 0.8546, 'grad_norm': 3.7529945373535156, 'learning_rate': 3.2807007367736756e-06} {'loss': 0.8546, 'grad_norm': 3.7529945373535156, 'learning_rate': 3.2807007367736756e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7558, 'grad_norm': 12.996456146240234, 'learning_rate': 3.275651920047087e-06}[Rank 1] Trainer log: {'loss': 0.7558, 'grad_norm': 12.996456146240234, 'learning_rate': 3.275651920047087e-06} [Rank 0] Trainer log: {'loss': 0.7558, 'grad_norm': 12.996456146240234, 'learning_rate': 3.275651920047087e-06}[Rank 3] Trainer log: {'loss': 0.7558, 'grad_norm': 12.996456146240234, 'learning_rate': 3.275651920047087e-06} {'loss': 0.7558, 'grad_norm': 12.996456146240234, 'learning_rate': 3.275651920047087e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.707, 'grad_norm': 2.502427101135254, 'learning_rate': 3.2706062302168794e-06} [Rank 3] Trainer log: {'loss': 0.707, 'grad_norm': 2.502427101135254, 'learning_rate': 3.2706062302168794e-06} [Rank 0] Trainer log: {'loss': 0.707, 'grad_norm': 2.502427101135254, 'learning_rate': 3.2706062302168794e-06} {'loss': 0.707, 'grad_norm': 2.502427101135254, 'learning_rate': 3.2706062302168794e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.707, 'grad_norm': 2.502427101135254, 'learning_rate': 3.2706062302168794e-06} [Rank 3] Trainer log: {'loss': 0.4409, 'grad_norm': 16.4840087890625, 'learning_rate': 3.2655636696293602e-06}[Rank 2] Trainer log: {'loss': 0.4409, 'grad_norm': 16.4840087890625, 'learning_rate': 3.2655636696293602e-06} [Rank 1] Trainer log: {'loss': 0.4409, 'grad_norm': 16.4840087890625, 'learning_rate': 3.2655636696293602e-06} [Rank 0] Trainer log: {'loss': 0.4409, 'grad_norm': 16.4840087890625, 'learning_rate': 3.2655636696293602e-06} {'loss': 0.4409, 'grad_norm': 16.4840087890625, 'learning_rate': 3.2655636696293602e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.9765, 'grad_norm': 1.694443702697754, 'learning_rate': 3.2605242406293736e-06} [Rank 3] Trainer log: {'loss': 0.9765, 'grad_norm': 1.694443702697754, 'learning_rate': 3.2605242406293736e-06}[Rank 0] Trainer log: {'loss': 0.9765, 'grad_norm': 1.694443702697754, 'learning_rate': 3.2605242406293736e-06} [Rank 1] Trainer log: {'loss': 0.9765, 'grad_norm': 1.694443702697754, 'learning_rate': 3.2605242406293736e-06} {'loss': 0.9765, 'grad_norm': 1.694443702697754, 'learning_rate': 3.2605242406293736e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7521, 'grad_norm': 2.926010847091675, 'learning_rate': 3.2554879455603074e-06} [Rank 0] Trainer log: {'loss': 0.7521, 'grad_norm': 2.926010847091675, 'learning_rate': 3.2554879455603074e-06}[Rank 3] Trainer log: {'loss': 0.7521, 'grad_norm': 2.926010847091675, 'learning_rate': 3.2554879455603074e-06} [Rank 1] Trainer log: {'loss': 0.7521, 'grad_norm': 2.926010847091675, 'learning_rate': 3.2554879455603074e-06} {'loss': 0.7521, 'grad_norm': 2.926010847091675, 'learning_rate': 3.2554879455603074e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.9002, 'grad_norm': 3.600437641143799, 'learning_rate': 3.250454786764099e-06}[Rank 2] Trainer log: {'loss': 0.9002, 'grad_norm': 3.600437641143799, 'learning_rate': 3.250454786764099e-06}[Rank 1] Trainer log: {'loss': 0.9002, 'grad_norm': 3.600437641143799, 'learning_rate': 3.250454786764099e-06} [Rank 3] Trainer log: {'loss': 0.9002, 'grad_norm': 3.600437641143799, 'learning_rate': 3.250454786764099e-06} {'loss': 0.9002, 'grad_norm': 3.600437641143799, 'learning_rate': 3.250454786764099e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.7977, 'grad_norm': 4.442351818084717, 'learning_rate': 3.2454247665812176e-06} [Rank 1] Trainer log: {'loss': 0.7977, 'grad_norm': 4.442351818084717, 'learning_rate': 3.2454247665812176e-06} [Rank 2] Trainer log: {'loss': 0.7977, 'grad_norm': 4.442351818084717, 'learning_rate': 3.2454247665812176e-06} [Rank 3] Trainer log: {'loss': 0.7977, 'grad_norm': 4.442351818084717, 'learning_rate': 3.2454247665812176e-06} {'loss': 0.7977, 'grad_norm': 4.442351818084717, 'learning_rate': 3.2454247665812176e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.7439, 'grad_norm': 11.293764114379883, 'learning_rate': 3.240397887350686e-06}[Rank 3] Trainer log: {'loss': 0.7439, 'grad_norm': 11.293764114379883, 'learning_rate': 3.240397887350686e-06} [Rank 2] Trainer log: {'loss': 0.7439, 'grad_norm': 11.293764114379883, 'learning_rate': 3.240397887350686e-06} [Rank 1] Trainer log: {'loss': 0.7439, 'grad_norm': 11.293764114379883, 'learning_rate': 3.240397887350686e-06} {'loss': 0.7439, 'grad_norm': 11.293764114379883, 'learning_rate': 3.240397887350686e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.8756, 'grad_norm': 6.784129619598389, 'learning_rate': 3.2353741514100503e-06}[Rank 2] Trainer log: {'loss': 0.8756, 'grad_norm': 6.784129619598389, 'learning_rate': 3.2353741514100503e-06} [Rank 3] Trainer log: {'loss': 0.8756, 'grad_norm': 6.784129619598389, 'learning_rate': 3.2353741514100503e-06} [Rank 0] Trainer log: {'loss': 0.8756, 'grad_norm': 6.784129619598389, 'learning_rate': 3.2353741514100503e-06} {'loss': 0.8756, 'grad_norm': 6.784129619598389, 'learning_rate': 3.2353741514100503e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.6246, 'grad_norm': 5.401147365570068, 'learning_rate': 3.2303535610954107e-06} [Rank 0] Trainer log: {'loss': 0.6246, 'grad_norm': 5.401147365570068, 'learning_rate': 3.2303535610954107e-06}[Rank 3] Trainer log: {'loss': 0.6246, 'grad_norm': 5.401147365570068, 'learning_rate': 3.2303535610954107e-06} [Rank 2] Trainer log: {'loss': 0.6246, 'grad_norm': 5.401147365570068, 'learning_rate': 3.2303535610954107e-06} {'loss': 0.6246, 'grad_norm': 5.401147365570068, 'learning_rate': 3.2303535610954107e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 1.0735, 'grad_norm': 3.8424484729766846, 'learning_rate': 3.2253361187413946e-06} [Rank 2] Trainer log: {'loss': 1.0735, 'grad_norm': 3.8424484729766846, 'learning_rate': 3.2253361187413946e-06} [Rank 1] Trainer log: {'loss': 1.0735, 'grad_norm': 3.8424484729766846, 'learning_rate': 3.2253361187413946e-06} [Rank 0] Trainer log: {'loss': 1.0735, 'grad_norm': 3.8424484729766846, 'learning_rate': 3.2253361187413946e-06} {'loss': 1.0735, 'grad_norm': 3.8424484729766846, 'learning_rate': 3.2253361187413946e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.7238, 'grad_norm': 5.7342634201049805, 'learning_rate': 3.220321826681164e-06} [Rank 2] Trainer log: {'loss': 0.7238, 'grad_norm': 5.7342634201049805, 'learning_rate': 3.220321826681164e-06} [Rank 0] Trainer log: {'loss': 0.7238, 'grad_norm': 5.7342634201049805, 'learning_rate': 3.220321826681164e-06}[Rank 1] Trainer log: {'loss': 0.7238, 'grad_norm': 5.7342634201049805, 'learning_rate': 3.220321826681164e-06} {'loss': 0.7238, 'grad_norm': 5.7342634201049805, 'learning_rate': 3.220321826681164e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.8319, 'grad_norm': 16.190114974975586, 'learning_rate': 3.215310687246428e-06}[Rank 3] Trainer log: {'loss': 0.8319, 'grad_norm': 16.190114974975586, 'learning_rate': 3.215310687246428e-06}[Rank 1] Trainer log: {'loss': 0.8319, 'grad_norm': 16.190114974975586, 'learning_rate': 3.215310687246428e-06} [Rank 2] Trainer log: {'loss': 0.8319, 'grad_norm': 16.190114974975586, 'learning_rate': 3.215310687246428e-06} {'loss': 0.8319, 'grad_norm': 16.190114974975586, 'learning_rate': 3.215310687246428e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.849, 'grad_norm': 3.4482240676879883, 'learning_rate': 3.2103027027674185e-06}[Rank 0] Trainer log: {'loss': 0.849, 'grad_norm': 3.4482240676879883, 'learning_rate': 3.2103027027674185e-06} [Rank 3] Trainer log: {'loss': 0.849, 'grad_norm': 3.4482240676879883, 'learning_rate': 3.2103027027674185e-06} [Rank 1] Trainer log: {'loss': 0.849, 'grad_norm': 3.4482240676879883, 'learning_rate': 3.2103027027674185e-06} {'loss': 0.849, 'grad_norm': 3.4482240676879883, 'learning_rate': 3.2103027027674185e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.5985, 'grad_norm': 4.522181034088135, 'learning_rate': 3.2052978755728993e-06} [Rank 2] Trainer log: {'loss': 0.5985, 'grad_norm': 4.522181034088135, 'learning_rate': 3.2052978755728993e-06} [Rank 1] Trainer log: {'loss': 0.5985, 'grad_norm': 4.522181034088135, 'learning_rate': 3.2052978755728993e-06} [Rank 0] Trainer log: {'loss': 0.5985, 'grad_norm': 4.522181034088135, 'learning_rate': 3.2052978755728993e-06} {'loss': 0.5985, 'grad_norm': 4.522181034088135, 'learning_rate': 3.2052978755728993e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.9057, 'grad_norm': 2.13624906539917, 'learning_rate': 3.2002962079901743e-06}[Rank 3] Trainer log: {'loss': 0.9057, 'grad_norm': 2.13624906539917, 'learning_rate': 3.2002962079901743e-06} [Rank 0] Trainer log: {'loss': 0.9057, 'grad_norm': 2.13624906539917, 'learning_rate': 3.2002962079901743e-06} [Rank 1] Trainer log: {'loss': 0.9057, 'grad_norm': 2.13624906539917, 'learning_rate': 3.2002962079901743e-06} {'loss': 0.9057, 'grad_norm': 2.13624906539917, 'learning_rate': 3.2002962079901743e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7063, 'grad_norm': 3.778510808944702, 'learning_rate': 3.195297702345079e-06} [Rank 3] Trainer log: {'loss': 0.7063, 'grad_norm': 3.778510808944702, 'learning_rate': 3.195297702345079e-06} [Rank 0] Trainer log: {'loss': 0.7063, 'grad_norm': 3.778510808944702, 'learning_rate': 3.195297702345079e-06}[Rank 1] Trainer log: {'loss': 0.7063, 'grad_norm': 3.778510808944702, 'learning_rate': 3.195297702345079e-06} {'loss': 0.7063, 'grad_norm': 3.778510808944702, 'learning_rate': 3.195297702345079e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.8957, 'grad_norm': 7.46593713760376, 'learning_rate': 3.1903023609619654e-06}[Rank 2] Trainer log: {'loss': 0.8957, 'grad_norm': 7.46593713760376, 'learning_rate': 3.1903023609619654e-06} [Rank 3] Trainer log: {'loss': 0.8957, 'grad_norm': 7.46593713760376, 'learning_rate': 3.1903023609619654e-06} [Rank 1] Trainer log: {'loss': 0.8957, 'grad_norm': 7.46593713760376, 'learning_rate': 3.1903023609619654e-06} {'loss': 0.8957, 'grad_norm': 7.46593713760376, 'learning_rate': 3.1903023609619654e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.7022, 'grad_norm': 2.6058409214019775, 'learning_rate': 3.1853101861637303e-06}[Rank 2] Trainer log: {'loss': 0.7022, 'grad_norm': 2.6058409214019775, 'learning_rate': 3.1853101861637303e-06}[Rank 3] Trainer log: {'loss': 0.7022, 'grad_norm': 2.6058409214019775, 'learning_rate': 3.1853101861637303e-06} [Rank 1] Trainer log: {'loss': 0.7022, 'grad_norm': 2.6058409214019775, 'learning_rate': 3.1853101861637303e-06} {'loss': 0.7022, 'grad_norm': 2.6058409214019775, 'learning_rate': 3.1853101861637303e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.753, 'grad_norm': 8.448689460754395, 'learning_rate': 3.1803211802717883e-06}[Rank 2] Trainer log: {'loss': 0.753, 'grad_norm': 8.448689460754395, 'learning_rate': 3.1803211802717883e-06} [Rank 1] Trainer log: {'loss': 0.753, 'grad_norm': 8.448689460754395, 'learning_rate': 3.1803211802717883e-06} [Rank 0] Trainer log: {'loss': 0.753, 'grad_norm': 8.448689460754395, 'learning_rate': 3.1803211802717883e-06} {'loss': 0.753, 'grad_norm': 8.448689460754395, 'learning_rate': 3.1803211802717883e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.9536, 'grad_norm': 2.57907772064209, 'learning_rate': 3.1753353456060755e-06}[Rank 2] Trainer log: {'loss': 0.9536, 'grad_norm': 2.57907772064209, 'learning_rate': 3.1753353456060755e-06} [Rank 0] Trainer log: {'loss': 0.9536, 'grad_norm': 2.57907772064209, 'learning_rate': 3.1753353456060755e-06}[Rank 1] Trainer log: {'loss': 0.9536, 'grad_norm': 2.57907772064209, 'learning_rate': 3.1753353456060755e-06} {'loss': 0.9536, 'grad_norm': 2.57907772064209, 'learning_rate': 3.1753353456060755e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.9164, 'grad_norm': 4.81142520904541, 'learning_rate': 3.170352684485071e-06}[Rank 3] Trainer log: {'loss': 0.9164, 'grad_norm': 4.81142520904541, 'learning_rate': 3.170352684485071e-06}[Rank 2] Trainer log: {'loss': 0.9164, 'grad_norm': 4.81142520904541, 'learning_rate': 3.170352684485071e-06} [Rank 0] Trainer log: {'loss': 0.9164, 'grad_norm': 4.81142520904541, 'learning_rate': 3.170352684485071e-06} {'loss': 0.9164, 'grad_norm': 4.81142520904541, 'learning_rate': 3.170352684485071e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.7717, 'grad_norm': 3.658052921295166, 'learning_rate': 3.1653731992257606e-06} [Rank 2] Trainer log: {'loss': 0.7717, 'grad_norm': 3.658052921295166, 'learning_rate': 3.1653731992257606e-06} [Rank 0] Trainer log: {'loss': 0.7717, 'grad_norm': 3.658052921295166, 'learning_rate': 3.1653731992257606e-06}[Rank 3] Trainer log: {'loss': 0.7717, 'grad_norm': 3.658052921295166, 'learning_rate': 3.1653731992257606e-06} {'loss': 0.7717, 'grad_norm': 3.658052921295166, 'learning_rate': 3.1653731992257606e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.9773, 'grad_norm': 2.3638195991516113, 'learning_rate': 3.160396892143659e-06} [Rank 2] Trainer log: {'loss': 0.9773, 'grad_norm': 2.3638195991516113, 'learning_rate': 3.160396892143659e-06}[Rank 0] Trainer log: {'loss': 0.9773, 'grad_norm': 2.3638195991516113, 'learning_rate': 3.160396892143659e-06} [Rank 1] Trainer log: {'loss': 0.9773, 'grad_norm': 2.3638195991516113, 'learning_rate': 3.160396892143659e-06} {'loss': 0.9773, 'grad_norm': 2.3638195991516113, 'learning_rate': 3.160396892143659e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.4996, 'grad_norm': 3.5916061401367188, 'learning_rate': 3.1554237655528076e-06} [Rank 2] Trainer log: {'loss': 0.4996, 'grad_norm': 3.5916061401367188, 'learning_rate': 3.1554237655528076e-06} [Rank 3] Trainer log: {'loss': 0.4996, 'grad_norm': 3.5916061401367188, 'learning_rate': 3.1554237655528076e-06} [Rank 0] Trainer log: {'loss': 0.4996, 'grad_norm': 3.5916061401367188, 'learning_rate': 3.1554237655528076e-06} {'loss': 0.4996, 'grad_norm': 3.5916061401367188, 'learning_rate': 3.1554237655528076e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.9181, 'grad_norm': 12.177791595458984, 'learning_rate': 3.1504538217657663e-06}[Rank 3] Trainer log: {'loss': 0.9181, 'grad_norm': 12.177791595458984, 'learning_rate': 3.1504538217657663e-06} [Rank 1] Trainer log: {'loss': 0.9181, 'grad_norm': 12.177791595458984, 'learning_rate': 3.1504538217657663e-06} [Rank 0] Trainer log: {'loss': 0.9181, 'grad_norm': 12.177791595458984, 'learning_rate': 3.1504538217657663e-06} {'loss': 0.9181, 'grad_norm': 12.177791595458984, 'learning_rate': 3.1504538217657663e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.9114, 'grad_norm': 5.445841312408447, 'learning_rate': 3.1454870630936107e-06}[Rank 3] Trainer log: {'loss': 0.9114, 'grad_norm': 5.445841312408447, 'learning_rate': 3.1454870630936107e-06} [Rank 1] Trainer log: {'loss': 0.9114, 'grad_norm': 5.445841312408447, 'learning_rate': 3.1454870630936107e-06} [Rank 0] Trainer log: {'loss': 0.9114, 'grad_norm': 5.445841312408447, 'learning_rate': 3.1454870630936107e-06} {'loss': 0.9114, 'grad_norm': 5.445841312408447, 'learning_rate': 3.1454870630936107e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 1.0382, 'grad_norm': 2.89334774017334, 'learning_rate': 3.140523491845943e-06}[Rank 3] Trainer log: {'loss': 1.0382, 'grad_norm': 2.89334774017334, 'learning_rate': 3.140523491845943e-06}[Rank 2] Trainer log: {'loss': 1.0382, 'grad_norm': 2.89334774017334, 'learning_rate': 3.140523491845943e-06} [Rank 1] Trainer log: {'loss': 1.0382, 'grad_norm': 2.89334774017334, 'learning_rate': 3.140523491845943e-06} {'loss': 1.0382, 'grad_norm': 2.89334774017334, 'learning_rate': 3.140523491845943e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.71, 'grad_norm': 2.037142276763916, 'learning_rate': 3.135563110330876e-06}[Rank 2] Trainer log: {'loss': 0.71, 'grad_norm': 2.037142276763916, 'learning_rate': 3.135563110330876e-06} [Rank 1] Trainer log: {'loss': 0.71, 'grad_norm': 2.037142276763916, 'learning_rate': 3.135563110330876e-06} [Rank 0] Trainer log: {'loss': 0.71, 'grad_norm': 2.037142276763916, 'learning_rate': 3.135563110330876e-06} {'loss': 0.71, 'grad_norm': 2.037142276763916, 'learning_rate': 3.135563110330876e-06, 'epoch': 0.75} [Rank 0] Trainer log: {'loss': 0.6748, 'grad_norm': 3.355199098587036, 'learning_rate': 3.1306059208550387e-06}[Rank 2] Trainer log: {'loss': 0.6748, 'grad_norm': 3.355199098587036, 'learning_rate': 3.1306059208550387e-06} [Rank 3] Trainer log: {'loss': 0.6748, 'grad_norm': 3.355199098587036, 'learning_rate': 3.1306059208550387e-06} [Rank 1] Trainer log: {'loss': 0.6748, 'grad_norm': 3.355199098587036, 'learning_rate': 3.1306059208550387e-06} {'loss': 0.6748, 'grad_norm': 3.355199098587036, 'learning_rate': 3.1306059208550387e-06, 'epoch': 0.75} [Rank 3] Trainer log: {'loss': 0.4777, 'grad_norm': 3.860050916671753, 'learning_rate': 3.1256519257235863e-06}[Rank 2] Trainer log: {'loss': 0.4777, 'grad_norm': 3.860050916671753, 'learning_rate': 3.1256519257235863e-06} [Rank 1] Trainer log: {'loss': 0.4777, 'grad_norm': 3.860050916671753, 'learning_rate': 3.1256519257235863e-06} [Rank 0] Trainer log: {'loss': 0.4777, 'grad_norm': 3.860050916671753, 'learning_rate': 3.1256519257235863e-06} {'loss': 0.4777, 'grad_norm': 3.860050916671753, 'learning_rate': 3.1256519257235863e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.6969, 'grad_norm': 2.678042411804199, 'learning_rate': 3.1207011272401745e-06}[Rank 0] Trainer log: {'loss': 0.6969, 'grad_norm': 2.678042411804199, 'learning_rate': 3.1207011272401745e-06} [Rank 3] Trainer log: {'loss': 0.6969, 'grad_norm': 2.678042411804199, 'learning_rate': 3.1207011272401745e-06} {'loss': 0.6969, 'grad_norm': 2.678042411804199, 'learning_rate': 3.1207011272401745e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.6969, 'grad_norm': 2.678042411804199, 'learning_rate': 3.1207011272401745e-06} [Rank 2] Trainer log: {'loss': 0.9617, 'grad_norm': 5.910046100616455, 'learning_rate': 3.115753527706986e-06}[Rank 3] Trainer log: {'loss': 0.9617, 'grad_norm': 5.910046100616455, 'learning_rate': 3.115753527706986e-06} [Rank 0] Trainer log: {'loss': 0.9617, 'grad_norm': 5.910046100616455, 'learning_rate': 3.115753527706986e-06} [Rank 1] Trainer log: {'loss': 0.9617, 'grad_norm': 5.910046100616455, 'learning_rate': 3.115753527706986e-06} {'loss': 0.9617, 'grad_norm': 5.910046100616455, 'learning_rate': 3.115753527706986e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7276, 'grad_norm': 3.28489351272583, 'learning_rate': 3.1108091294247035e-06}[Rank 1] Trainer log: {'loss': 0.7276, 'grad_norm': 3.28489351272583, 'learning_rate': 3.1108091294247035e-06}[Rank 3] Trainer log: {'loss': 0.7276, 'grad_norm': 3.28489351272583, 'learning_rate': 3.1108091294247035e-06} [Rank 0] Trainer log: {'loss': 0.7276, 'grad_norm': 3.28489351272583, 'learning_rate': 3.1108091294247035e-06} {'loss': 0.7276, 'grad_norm': 3.28489351272583, 'learning_rate': 3.1108091294247035e-06, 'epoch': 0.75} [Rank 2] Trainer log: {'loss': 0.7022, 'grad_norm': 3.1342363357543945, 'learning_rate': 3.1058679346925326e-06}[Rank 1] Trainer log: {'loss': 0.7022, 'grad_norm': 3.1342363357543945, 'learning_rate': 3.1058679346925326e-06}[Rank 3] Trainer log: {'loss': 0.7022, 'grad_norm': 3.1342363357543945, 'learning_rate': 3.1058679346925326e-06} [Rank 0] Trainer log: {'loss': 0.7022, 'grad_norm': 3.1342363357543945, 'learning_rate': 3.1058679346925326e-06} {'loss': 0.7022, 'grad_norm': 3.1342363357543945, 'learning_rate': 3.1058679346925326e-06, 'epoch': 0.75} [Rank 1] Trainer log: {'loss': 0.8356, 'grad_norm': 8.109475135803223, 'learning_rate': 3.100929945808181e-06}[Rank 0] Trainer log: {'loss': 0.8356, 'grad_norm': 8.109475135803223, 'learning_rate': 3.100929945808181e-06}[Rank 3] Trainer log: {'loss': 0.8356, 'grad_norm': 8.109475135803223, 'learning_rate': 3.100929945808181e-06} [Rank 2] Trainer log: {'loss': 0.8356, 'grad_norm': 8.109475135803223, 'learning_rate': 3.100929945808181e-06} {'loss': 0.8356, 'grad_norm': 8.109475135803223, 'learning_rate': 3.100929945808181e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.683, 'grad_norm': 4.369258403778076, 'learning_rate': 3.0959951650678645e-06}[Rank 1] Trainer log: {'loss': 0.683, 'grad_norm': 4.369258403778076, 'learning_rate': 3.0959951650678645e-06}[Rank 3] Trainer log: {'loss': 0.683, 'grad_norm': 4.369258403778076, 'learning_rate': 3.0959951650678645e-06} [Rank 0] Trainer log: {'loss': 0.683, 'grad_norm': 4.369258403778076, 'learning_rate': 3.0959951650678645e-06} {'loss': 0.683, 'grad_norm': 4.369258403778076, 'learning_rate': 3.0959951650678645e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 1.0259, 'grad_norm': 5.1977925300598145, 'learning_rate': 3.091063594766317e-06}[Rank 2] Trainer log: {'loss': 1.0259, 'grad_norm': 5.1977925300598145, 'learning_rate': 3.091063594766317e-06} [Rank 3] Trainer log: {'loss': 1.0259, 'grad_norm': 5.1977925300598145, 'learning_rate': 3.091063594766317e-06} {'loss': 1.0259, 'grad_norm': 5.1977925300598145, 'learning_rate': 3.091063594766317e-06, 'epoch': 0.76} [Rank 1] Trainer log: {'loss': 1.0259, 'grad_norm': 5.1977925300598145, 'learning_rate': 3.091063594766317e-06} [Rank 2] Trainer log: {'loss': 0.7524, 'grad_norm': 4.615601062774658, 'learning_rate': 3.086135237196771e-06} [Rank 3] Trainer log: {'loss': 0.7524, 'grad_norm': 4.615601062774658, 'learning_rate': 3.086135237196771e-06} [Rank 1] Trainer log: {'loss': 0.7524, 'grad_norm': 4.615601062774658, 'learning_rate': 3.086135237196771e-06} [Rank 0] Trainer log: {'loss': 0.7524, 'grad_norm': 4.615601062774658, 'learning_rate': 3.086135237196771e-06} {'loss': 0.7524, 'grad_norm': 4.615601062774658, 'learning_rate': 3.086135237196771e-06, 'epoch': 0.76} [Rank 3] Trainer log: {'loss': 0.7703, 'grad_norm': 2.5928685665130615, 'learning_rate': 3.0812100946509637e-06}[Rank 2] Trainer log: {'loss': 0.7703, 'grad_norm': 2.5928685665130615, 'learning_rate': 3.0812100946509637e-06} [Rank 1] Trainer log: {'loss': 0.7703, 'grad_norm': 2.5928685665130615, 'learning_rate': 3.0812100946509637e-06} [Rank 0] Trainer log: {'loss': 0.7703, 'grad_norm': 2.5928685665130615, 'learning_rate': 3.0812100946509637e-06} {'loss': 0.7703, 'grad_norm': 2.5928685665130615, 'learning_rate': 3.0812100946509637e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8193, 'grad_norm': 8.310372352600098, 'learning_rate': 3.0762881694191448e-06} [Rank 3] Trainer log: {'loss': 0.8193, 'grad_norm': 8.310372352600098, 'learning_rate': 3.0762881694191448e-06} [Rank 0] Trainer log: {'loss': 0.8193, 'grad_norm': 8.310372352600098, 'learning_rate': 3.0762881694191448e-06} [Rank 1] Trainer log: {'loss': 0.8193, 'grad_norm': 8.310372352600098, 'learning_rate': 3.0762881694191448e-06} {'loss': 0.8193, 'grad_norm': 8.310372352600098, 'learning_rate': 3.0762881694191448e-06, 'epoch': 0.76} [Rank 1] Trainer log: {'loss': 0.8859, 'grad_norm': 4.066675662994385, 'learning_rate': 3.0713694637900672e-06}[Rank 0] Trainer log: {'loss': 0.8859, 'grad_norm': 4.066675662994385, 'learning_rate': 3.0713694637900672e-06} [Rank 3] Trainer log: {'loss': 0.8859, 'grad_norm': 4.066675662994385, 'learning_rate': 3.0713694637900672e-06} [Rank 2] Trainer log: {'loss': 0.8859, 'grad_norm': 4.066675662994385, 'learning_rate': 3.0713694637900672e-06} {'loss': 0.8859, 'grad_norm': 4.066675662994385, 'learning_rate': 3.0713694637900672e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.7337, 'grad_norm': 8.924094200134277, 'learning_rate': 3.0664539800509763e-06}[Rank 3] Trainer log: {'loss': 0.7337, 'grad_norm': 8.924094200134277, 'learning_rate': 3.0664539800509763e-06}[Rank 2] Trainer log: {'loss': 0.7337, 'grad_norm': 8.924094200134277, 'learning_rate': 3.0664539800509763e-06} [Rank 1] Trainer log: {'loss': 0.7337, 'grad_norm': 8.924094200134277, 'learning_rate': 3.0664539800509763e-06} {'loss': 0.7337, 'grad_norm': 8.924094200134277, 'learning_rate': 3.0664539800509763e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.6169, 'grad_norm': 4.82764196395874, 'learning_rate': 3.061541720487633e-06}[Rank 3] Trainer log: {'loss': 0.6169, 'grad_norm': 4.82764196395874, 'learning_rate': 3.061541720487633e-06} [Rank 2] Trainer log: {'loss': 0.6169, 'grad_norm': 4.82764196395874, 'learning_rate': 3.061541720487633e-06} [Rank 1] Trainer log: {'loss': 0.6169, 'grad_norm': 4.82764196395874, 'learning_rate': 3.061541720487633e-06} {'loss': 0.6169, 'grad_norm': 4.82764196395874, 'learning_rate': 3.061541720487633e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.9571, 'grad_norm': 6.1469902992248535, 'learning_rate': 3.05663268738429e-06} [Rank 1] Trainer log: {'loss': 0.9571, 'grad_norm': 6.1469902992248535, 'learning_rate': 3.05663268738429e-06} [Rank 0] Trainer log: {'loss': 0.9571, 'grad_norm': 6.1469902992248535, 'learning_rate': 3.05663268738429e-06}[Rank 3] Trainer log: {'loss': 0.9571, 'grad_norm': 6.1469902992248535, 'learning_rate': 3.05663268738429e-06} {'loss': 0.9571, 'grad_norm': 6.1469902992248535, 'learning_rate': 3.05663268738429e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8226, 'grad_norm': 3.5001108646392822, 'learning_rate': 3.051726883023698e-06} [Rank 0] Trainer log: {'loss': 0.8226, 'grad_norm': 3.5001108646392822, 'learning_rate': 3.051726883023698e-06}[Rank 1] Trainer log: {'loss': 0.8226, 'grad_norm': 3.5001108646392822, 'learning_rate': 3.051726883023698e-06} [Rank 3] Trainer log: {'loss': 0.8226, 'grad_norm': 3.5001108646392822, 'learning_rate': 3.051726883023698e-06} {'loss': 0.8226, 'grad_norm': 3.5001108646392822, 'learning_rate': 3.051726883023698e-06, 'epoch': 0.76} [Rank 1] Trainer log: {'loss': 0.7868, 'grad_norm': 3.1638879776000977, 'learning_rate': 3.0468243096871175e-06}[Rank 2] Trainer log: {'loss': 0.7868, 'grad_norm': 3.1638879776000977, 'learning_rate': 3.0468243096871175e-06} [Rank 3] Trainer log: {'loss': 0.7868, 'grad_norm': 3.1638879776000977, 'learning_rate': 3.0468243096871175e-06} [Rank 0] Trainer log: {'loss': 0.7868, 'grad_norm': 3.1638879776000977, 'learning_rate': 3.0468243096871175e-06} {'loss': 0.7868, 'grad_norm': 3.1638879776000977, 'learning_rate': 3.0468243096871175e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.6895, 'grad_norm': 2.5114829540252686, 'learning_rate': 3.0419249696542964e-06}[Rank 3] Trainer log: {'loss': 0.6895, 'grad_norm': 2.5114829540252686, 'learning_rate': 3.0419249696542964e-06} [Rank 0] Trainer log: {'loss': 0.6895, 'grad_norm': 2.5114829540252686, 'learning_rate': 3.0419249696542964e-06}[Rank 1] Trainer log: {'loss': 0.6895, 'grad_norm': 2.5114829540252686, 'learning_rate': 3.0419249696542964e-06} {'loss': 0.6895, 'grad_norm': 2.5114829540252686, 'learning_rate': 3.0419249696542964e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8445, 'grad_norm': 1.9591039419174194, 'learning_rate': 3.0370288652034784e-06} [Rank 1] Trainer log: {'loss': 0.8445, 'grad_norm': 1.9591039419174194, 'learning_rate': 3.0370288652034784e-06} [Rank 3] Trainer log: {'loss': 0.8445, 'grad_norm': 1.9591039419174194, 'learning_rate': 3.0370288652034784e-06} [Rank 0] Trainer log: {'loss': 0.8445, 'grad_norm': 1.9591039419174194, 'learning_rate': 3.0370288652034784e-06} {'loss': 0.8445, 'grad_norm': 1.9591039419174194, 'learning_rate': 3.0370288652034784e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8332, 'grad_norm': 10.423879623413086, 'learning_rate': 3.0321359986114096e-06} [Rank 3] Trainer log: {'loss': 0.8332, 'grad_norm': 10.423879623413086, 'learning_rate': 3.0321359986114096e-06} [Rank 0] Trainer log: {'loss': 0.8332, 'grad_norm': 10.423879623413086, 'learning_rate': 3.0321359986114096e-06}[Rank 1] Trainer log: {'loss': 0.8332, 'grad_norm': 10.423879623413086, 'learning_rate': 3.0321359986114096e-06} {'loss': 0.8332, 'grad_norm': 10.423879623413086, 'learning_rate': 3.0321359986114096e-06, 'epoch': 0.76} [Rank 1] Trainer log: {'loss': 0.7444, 'grad_norm': 18.488853454589844, 'learning_rate': 3.027246372153331e-06}[Rank 2] Trainer log: {'loss': 0.7444, 'grad_norm': 18.488853454589844, 'learning_rate': 3.027246372153331e-06}[Rank 3] Trainer log: {'loss': 0.7444, 'grad_norm': 18.488853454589844, 'learning_rate': 3.027246372153331e-06} [Rank 0] Trainer log: {'loss': 0.7444, 'grad_norm': 18.488853454589844, 'learning_rate': 3.027246372153331e-06} {'loss': 0.7444, 'grad_norm': 18.488853454589844, 'learning_rate': 3.027246372153331e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8459, 'grad_norm': 5.121431827545166, 'learning_rate': 3.0223599881029664e-06}[Rank 3] Trainer log: {'loss': 0.8459, 'grad_norm': 5.121431827545166, 'learning_rate': 3.0223599881029664e-06}[Rank 0] Trainer log: {'loss': 0.8459, 'grad_norm': 5.121431827545166, 'learning_rate': 3.0223599881029664e-06} [Rank 1] Trainer log: {'loss': 0.8459, 'grad_norm': 5.121431827545166, 'learning_rate': 3.0223599881029664e-06} {'loss': 0.8459, 'grad_norm': 5.121431827545166, 'learning_rate': 3.0223599881029664e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7275, 'grad_norm': 3.654749870300293, 'learning_rate': 3.0174768487325435e-06} [Rank 3] Trainer log: {'loss': 0.7275, 'grad_norm': 3.654749870300293, 'learning_rate': 3.0174768487325435e-06} [Rank 1] Trainer log: {'loss': 0.7275, 'grad_norm': 3.654749870300293, 'learning_rate': 3.0174768487325435e-06} [Rank 0] Trainer log: {'loss': 0.7275, 'grad_norm': 3.654749870300293, 'learning_rate': 3.0174768487325435e-06} {'loss': 0.7275, 'grad_norm': 3.654749870300293, 'learning_rate': 3.0174768487325435e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7466, 'grad_norm': 5.034124851226807, 'learning_rate': 3.0125969563127743e-06}[Rank 3] Trainer log: {'loss': 0.7466, 'grad_norm': 5.034124851226807, 'learning_rate': 3.0125969563127743e-06}[Rank 0] Trainer log: {'loss': 0.7466, 'grad_norm': 5.034124851226807, 'learning_rate': 3.0125969563127743e-06} [Rank 1] Trainer log: {'loss': 0.7466, 'grad_norm': 5.034124851226807, 'learning_rate': 3.0125969563127743e-06} {'loss': 0.7466, 'grad_norm': 5.034124851226807, 'learning_rate': 3.0125969563127743e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7917, 'grad_norm': 4.642516613006592, 'learning_rate': 3.0077203131128607e-06}[Rank 3] Trainer log: {'loss': 0.7917, 'grad_norm': 4.642516613006592, 'learning_rate': 3.0077203131128607e-06} [Rank 1] Trainer log: {'loss': 0.7917, 'grad_norm': 4.642516613006592, 'learning_rate': 3.0077203131128607e-06} [Rank 0] Trainer log: {'loss': 0.7917, 'grad_norm': 4.642516613006592, 'learning_rate': 3.0077203131128607e-06} {'loss': 0.7917, 'grad_norm': 4.642516613006592, 'learning_rate': 3.0077203131128607e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7357, 'grad_norm': 7.357348442077637, 'learning_rate': 3.0028469214005006e-06}[Rank 1] Trainer log: {'loss': 0.7357, 'grad_norm': 7.357348442077637, 'learning_rate': 3.0028469214005006e-06} [Rank 3] Trainer log: {'loss': 0.7357, 'grad_norm': 7.357348442077637, 'learning_rate': 3.0028469214005006e-06} [Rank 0] Trainer log: {'loss': 0.7357, 'grad_norm': 7.357348442077637, 'learning_rate': 3.0028469214005006e-06} {'loss': 0.7357, 'grad_norm': 7.357348442077637, 'learning_rate': 3.0028469214005006e-06, 'epoch': 0.76} [Rank 3] Trainer log: {'loss': 0.7797, 'grad_norm': 7.813817501068115, 'learning_rate': 2.9979767834418726e-06} [Rank 0] Trainer log: {'loss': 0.7797, 'grad_norm': 7.813817501068115, 'learning_rate': 2.9979767834418726e-06}[Rank 2] Trainer log: {'loss': 0.7797, 'grad_norm': 7.813817501068115, 'learning_rate': 2.9979767834418726e-06}[Rank 1] Trainer log: {'loss': 0.7797, 'grad_norm': 7.813817501068115, 'learning_rate': 2.9979767834418726e-06} {'loss': 0.7797, 'grad_norm': 7.813817501068115, 'learning_rate': 2.9979767834418726e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.6141, 'grad_norm': 14.517114639282227, 'learning_rate': 2.9931099015016417e-06}[Rank 1] Trainer log: {'loss': 0.6141, 'grad_norm': 14.517114639282227, 'learning_rate': 2.9931099015016417e-06} [Rank 3] Trainer log: {'loss': 0.6141, 'grad_norm': 14.517114639282227, 'learning_rate': 2.9931099015016417e-06} [Rank 0] Trainer log: {'loss': 0.6141, 'grad_norm': 14.517114639282227, 'learning_rate': 2.9931099015016417e-06} {'loss': 0.6141, 'grad_norm': 14.517114639282227, 'learning_rate': 2.9931099015016417e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.7142, 'grad_norm': 3.5573127269744873, 'learning_rate': 2.9882462778429643e-06}[Rank 1] Trainer log: {'loss': 0.7142, 'grad_norm': 3.5573127269744873, 'learning_rate': 2.9882462778429643e-06}[Rank 3] Trainer log: {'loss': 0.7142, 'grad_norm': 3.5573127269744873, 'learning_rate': 2.9882462778429643e-06} [Rank 2] Trainer log: {'loss': 0.7142, 'grad_norm': 3.5573127269744873, 'learning_rate': 2.9882462778429643e-06} {'loss': 0.7142, 'grad_norm': 3.5573127269744873, 'learning_rate': 2.9882462778429643e-06, 'epoch': 0.76} [Rank 1] Trainer log: {'loss': 0.9442, 'grad_norm': 10.148120880126953, 'learning_rate': 2.9833859147274836e-06} [Rank 2] Trainer log: {'loss': 0.9442, 'grad_norm': 10.148120880126953, 'learning_rate': 2.9833859147274836e-06} [Rank 0] Trainer log: {'loss': 0.9442, 'grad_norm': 10.148120880126953, 'learning_rate': 2.9833859147274836e-06}[Rank 3] Trainer log: {'loss': 0.9442, 'grad_norm': 10.148120880126953, 'learning_rate': 2.9833859147274836e-06} {'loss': 0.9442, 'grad_norm': 10.148120880126953, 'learning_rate': 2.9833859147274836e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.9468, 'grad_norm': 6.344007968902588, 'learning_rate': 2.978528814415319e-06}[Rank 3] Trainer log: {'loss': 0.9468, 'grad_norm': 6.344007968902588, 'learning_rate': 2.978528814415319e-06} [Rank 0] Trainer log: {'loss': 0.9468, 'grad_norm': 6.344007968902588, 'learning_rate': 2.978528814415319e-06} [Rank 1] Trainer log: {'loss': 0.9468, 'grad_norm': 6.344007968902588, 'learning_rate': 2.978528814415319e-06} {'loss': 0.9468, 'grad_norm': 6.344007968902588, 'learning_rate': 2.978528814415319e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.6807, 'grad_norm': 5.173394203186035, 'learning_rate': 2.973674979165072e-06}[Rank 3] Trainer log: {'loss': 0.6807, 'grad_norm': 5.173394203186035, 'learning_rate': 2.973674979165072e-06}[Rank 2] Trainer log: {'loss': 0.6807, 'grad_norm': 5.173394203186035, 'learning_rate': 2.973674979165072e-06} [Rank 1] Trainer log: {'loss': 0.6807, 'grad_norm': 5.173394203186035, 'learning_rate': 2.973674979165072e-06} {'loss': 0.6807, 'grad_norm': 5.173394203186035, 'learning_rate': 2.973674979165072e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7694, 'grad_norm': 11.239935874938965, 'learning_rate': 2.968824411233837e-06}[Rank 3] Trainer log: {'loss': 0.7694, 'grad_norm': 11.239935874938965, 'learning_rate': 2.968824411233837e-06} [Rank 1] Trainer log: {'loss': 0.7694, 'grad_norm': 11.239935874938965, 'learning_rate': 2.968824411233837e-06} [Rank 0] Trainer log: {'loss': 0.7694, 'grad_norm': 11.239935874938965, 'learning_rate': 2.968824411233837e-06} {'loss': 0.7694, 'grad_norm': 11.239935874938965, 'learning_rate': 2.968824411233837e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.9798, 'grad_norm': 7.18096399307251, 'learning_rate': 2.9639771128771787e-06} [Rank 1] Trainer log: {'loss': 0.9798, 'grad_norm': 7.18096399307251, 'learning_rate': 2.9639771128771787e-06} [Rank 3] Trainer log: {'loss': 0.9798, 'grad_norm': 7.18096399307251, 'learning_rate': 2.9639771128771787e-06} [Rank 0] Trainer log: {'loss': 0.9798, 'grad_norm': 7.18096399307251, 'learning_rate': 2.9639771128771787e-06} {'loss': 0.9798, 'grad_norm': 7.18096399307251, 'learning_rate': 2.9639771128771787e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.939, 'grad_norm': 7.09716796875, 'learning_rate': 2.9591330863491395e-06}[Rank 1] Trainer log: {'loss': 0.939, 'grad_norm': 7.09716796875, 'learning_rate': 2.9591330863491395e-06}[Rank 2] Trainer log: {'loss': 0.939, 'grad_norm': 7.09716796875, 'learning_rate': 2.9591330863491395e-06} [Rank 3] Trainer log: {'loss': 0.939, 'grad_norm': 7.09716796875, 'learning_rate': 2.9591330863491395e-06} {'loss': 0.939, 'grad_norm': 7.09716796875, 'learning_rate': 2.9591330863491395e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.6507, 'grad_norm': 10.313652992248535, 'learning_rate': 2.9542923339022543e-06} [Rank 0] Trainer log: {'loss': 0.6507, 'grad_norm': 10.313652992248535, 'learning_rate': 2.9542923339022543e-06}[Rank 3] Trainer log: {'loss': 0.6507, 'grad_norm': 10.313652992248535, 'learning_rate': 2.9542923339022543e-06} [Rank 1] Trainer log: {'loss': 0.6507, 'grad_norm': 10.313652992248535, 'learning_rate': 2.9542923339022543e-06} {'loss': 0.6507, 'grad_norm': 10.313652992248535, 'learning_rate': 2.9542923339022543e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.866, 'grad_norm': 7.1059346199035645, 'learning_rate': 2.9494548577875195e-06} [Rank 3] Trainer log: {'loss': 0.866, 'grad_norm': 7.1059346199035645, 'learning_rate': 2.9494548577875195e-06} [Rank 1] Trainer log: {'loss': 0.866, 'grad_norm': 7.1059346199035645, 'learning_rate': 2.9494548577875195e-06} [Rank 0] Trainer log: {'loss': 0.866, 'grad_norm': 7.1059346199035645, 'learning_rate': 2.9494548577875195e-06} {'loss': 0.866, 'grad_norm': 7.1059346199035645, 'learning_rate': 2.9494548577875195e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8177, 'grad_norm': 5.754032611846924, 'learning_rate': 2.944620660254418e-06}[Rank 3] Trainer log: {'loss': 0.8177, 'grad_norm': 5.754032611846924, 'learning_rate': 2.944620660254418e-06} [Rank 0] Trainer log: {'loss': 0.8177, 'grad_norm': 5.754032611846924, 'learning_rate': 2.944620660254418e-06} [Rank 1] Trainer log: {'loss': 0.8177, 'grad_norm': 5.754032611846924, 'learning_rate': 2.944620660254418e-06} {'loss': 0.8177, 'grad_norm': 5.754032611846924, 'learning_rate': 2.944620660254418e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.9526, 'grad_norm': 1.6467398405075073, 'learning_rate': 2.939789743550908e-06} [Rank 1] Trainer log: {'loss': 0.9526, 'grad_norm': 1.6467398405075073, 'learning_rate': 2.939789743550908e-06} [Rank 0] Trainer log: {'loss': 0.9526, 'grad_norm': 1.6467398405075073, 'learning_rate': 2.939789743550908e-06} [Rank 3] Trainer log: {'loss': 0.9526, 'grad_norm': 1.6467398405075073, 'learning_rate': 2.939789743550908e-06} {'loss': 0.9526, 'grad_norm': 1.6467398405075073, 'learning_rate': 2.939789743550908e-06, 'epoch': 0.76} [Rank 3] Trainer log: {'loss': 0.7393, 'grad_norm': 7.032866954803467, 'learning_rate': 2.934962109923417e-06}[Rank 2] Trainer log: {'loss': 0.7393, 'grad_norm': 7.032866954803467, 'learning_rate': 2.934962109923417e-06} [Rank 0] Trainer log: {'loss': 0.7393, 'grad_norm': 7.032866954803467, 'learning_rate': 2.934962109923417e-06} [Rank 1] Trainer log: {'loss': 0.7393, 'grad_norm': 7.032866954803467, 'learning_rate': 2.934962109923417e-06} {'loss': 0.7393, 'grad_norm': 7.032866954803467, 'learning_rate': 2.934962109923417e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.878, 'grad_norm': 3.0653624534606934, 'learning_rate': 2.9301377616168435e-06}[Rank 3] Trainer log: {'loss': 0.878, 'grad_norm': 3.0653624534606934, 'learning_rate': 2.9301377616168435e-06} [Rank 0] Trainer log: {'loss': 0.878, 'grad_norm': 3.0653624534606934, 'learning_rate': 2.9301377616168435e-06}[Rank 1] Trainer log: {'loss': 0.878, 'grad_norm': 3.0653624534606934, 'learning_rate': 2.9301377616168435e-06} {'loss': 0.878, 'grad_norm': 3.0653624534606934, 'learning_rate': 2.9301377616168435e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7865, 'grad_norm': 5.935242652893066, 'learning_rate': 2.925316700874572e-06} [Rank 1] Trainer log: {'loss': 0.7865, 'grad_norm': 5.935242652893066, 'learning_rate': 2.925316700874572e-06} [Rank 0] Trainer log: {'loss': 0.7865, 'grad_norm': 5.935242652893066, 'learning_rate': 2.925316700874572e-06}[Rank 3] Trainer log: {'loss': 0.7865, 'grad_norm': 5.935242652893066, 'learning_rate': 2.925316700874572e-06} {'loss': 0.7865, 'grad_norm': 5.935242652893066, 'learning_rate': 2.925316700874572e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8552, 'grad_norm': 3.7229254245758057, 'learning_rate': 2.9204989299384435e-06}[Rank 0] Trainer log: {'loss': 0.8552, 'grad_norm': 3.7229254245758057, 'learning_rate': 2.9204989299384435e-06}[Rank 3] Trainer log: {'loss': 0.8552, 'grad_norm': 3.7229254245758057, 'learning_rate': 2.9204989299384435e-06} [Rank 1] Trainer log: {'loss': 0.8552, 'grad_norm': 3.7229254245758057, 'learning_rate': 2.9204989299384435e-06} {'loss': 0.8552, 'grad_norm': 3.7229254245758057, 'learning_rate': 2.9204989299384435e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8667, 'grad_norm': 6.495620250701904, 'learning_rate': 2.9156844510487725e-06}[Rank 0] Trainer log: {'loss': 0.8667, 'grad_norm': 6.495620250701904, 'learning_rate': 2.9156844510487725e-06}[Rank 3] Trainer log: {'loss': 0.8667, 'grad_norm': 6.495620250701904, 'learning_rate': 2.9156844510487725e-06} [Rank 1] Trainer log: {'loss': 0.8667, 'grad_norm': 6.495620250701904, 'learning_rate': 2.9156844510487725e-06} {'loss': 0.8667, 'grad_norm': 6.495620250701904, 'learning_rate': 2.9156844510487725e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.7935, 'grad_norm': 3.1274662017822266, 'learning_rate': 2.9108732664443507e-06}[Rank 2] Trainer log: {'loss': 0.7935, 'grad_norm': 3.1274662017822266, 'learning_rate': 2.9108732664443507e-06}[Rank 3] Trainer log: {'loss': 0.7935, 'grad_norm': 3.1274662017822266, 'learning_rate': 2.9108732664443507e-06} [Rank 1] Trainer log: {'loss': 0.7935, 'grad_norm': 3.1274662017822266, 'learning_rate': 2.9108732664443507e-06} {'loss': 0.7935, 'grad_norm': 3.1274662017822266, 'learning_rate': 2.9108732664443507e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7535, 'grad_norm': 5.720362663269043, 'learning_rate': 2.906065378362428e-06}[Rank 0] Trainer log: {'loss': 0.7535, 'grad_norm': 5.720362663269043, 'learning_rate': 2.906065378362428e-06}[Rank 1] Trainer log: {'loss': 0.7535, 'grad_norm': 5.720362663269043, 'learning_rate': 2.906065378362428e-06} [Rank 3] Trainer log: {'loss': 0.7535, 'grad_norm': 5.720362663269043, 'learning_rate': 2.906065378362428e-06} {'loss': 0.7535, 'grad_norm': 5.720362663269043, 'learning_rate': 2.906065378362428e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 1.0346, 'grad_norm': 2.3970894813537598, 'learning_rate': 2.901260789038727e-06} [Rank 0] Trainer log: {'loss': 1.0346, 'grad_norm': 2.3970894813537598, 'learning_rate': 2.901260789038727e-06}[Rank 3] Trainer log: {'loss': 1.0346, 'grad_norm': 2.3970894813537598, 'learning_rate': 2.901260789038727e-06} [Rank 1] Trainer log: {'loss': 1.0346, 'grad_norm': 2.3970894813537598, 'learning_rate': 2.901260789038727e-06} {'loss': 1.0346, 'grad_norm': 2.3970894813537598, 'learning_rate': 2.901260789038727e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8246, 'grad_norm': 9.615467071533203, 'learning_rate': 2.8964595007074383e-06}[Rank 3] Trainer log: {'loss': 0.8246, 'grad_norm': 9.615467071533203, 'learning_rate': 2.8964595007074383e-06} [Rank 0] Trainer log: {'loss': 0.8246, 'grad_norm': 9.615467071533203, 'learning_rate': 2.8964595007074383e-06} [Rank 1] Trainer log: {'loss': 0.8246, 'grad_norm': 9.615467071533203, 'learning_rate': 2.8964595007074383e-06} {'loss': 0.8246, 'grad_norm': 9.615467071533203, 'learning_rate': 2.8964595007074383e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8129, 'grad_norm': 2.8437843322753906, 'learning_rate': 2.8916615156012105e-06}[Rank 0] Trainer log: {'loss': 0.8129, 'grad_norm': 2.8437843322753906, 'learning_rate': 2.8916615156012105e-06} [Rank 3] Trainer log: {'loss': 0.8129, 'grad_norm': 2.8437843322753906, 'learning_rate': 2.8916615156012105e-06} [Rank 1] Trainer log: {'loss': 0.8129, 'grad_norm': 2.8437843322753906, 'learning_rate': 2.8916615156012105e-06} {'loss': 0.8129, 'grad_norm': 2.8437843322753906, 'learning_rate': 2.8916615156012105e-06, 'epoch': 0.76} [Rank 3] Trainer log: {'loss': 0.8146, 'grad_norm': 3.4146149158477783, 'learning_rate': 2.8868668359511586e-06} [Rank 0] Trainer log: {'loss': 0.8146, 'grad_norm': 3.4146149158477783, 'learning_rate': 2.8868668359511586e-06}[Rank 1] Trainer log: {'loss': 0.8146, 'grad_norm': 3.4146149158477783, 'learning_rate': 2.8868668359511586e-06} [Rank 2] Trainer log: {'loss': 0.8146, 'grad_norm': 3.4146149158477783, 'learning_rate': 2.8868668359511586e-06} {'loss': 0.8146, 'grad_norm': 3.4146149158477783, 'learning_rate': 2.8868668359511586e-06, 'epoch': 0.76} [Rank 0] Trainer log: {'loss': 0.7142, 'grad_norm': 5.356014251708984, 'learning_rate': 2.8820754639868674e-06}[Rank 1] Trainer log: {'loss': 0.7142, 'grad_norm': 5.356014251708984, 'learning_rate': 2.8820754639868674e-06} [Rank 2] Trainer log: {'loss': 0.7142, 'grad_norm': 5.356014251708984, 'learning_rate': 2.8820754639868674e-06} [Rank 3] Trainer log: {'loss': 0.7142, 'grad_norm': 5.356014251708984, 'learning_rate': 2.8820754639868674e-06} {'loss': 0.7142, 'grad_norm': 5.356014251708984, 'learning_rate': 2.8820754639868674e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.8042, 'grad_norm': 2.388598918914795, 'learning_rate': 2.877287401936374e-06}[Rank 0] Trainer log: {'loss': 0.8042, 'grad_norm': 2.388598918914795, 'learning_rate': 2.877287401936374e-06}[Rank 3] Trainer log: {'loss': 0.8042, 'grad_norm': 2.388598918914795, 'learning_rate': 2.877287401936374e-06} [Rank 1] Trainer log: {'loss': 0.8042, 'grad_norm': 2.388598918914795, 'learning_rate': 2.877287401936374e-06} {'loss': 0.8042, 'grad_norm': 2.388598918914795, 'learning_rate': 2.877287401936374e-06, 'epoch': 0.76} [Rank 2] Trainer log: {'loss': 0.7917, 'grad_norm': 4.4528608322143555, 'learning_rate': 2.872502652026178e-06} [Rank 0] Trainer log: {'loss': 0.7917, 'grad_norm': 4.4528608322143555, 'learning_rate': 2.872502652026178e-06}[Rank 3] Trainer log: {'loss': 0.7917, 'grad_norm': 4.4528608322143555, 'learning_rate': 2.872502652026178e-06} [Rank 1] Trainer log: {'loss': 0.7917, 'grad_norm': 4.4528608322143555, 'learning_rate': 2.872502652026178e-06} {'loss': 0.7917, 'grad_norm': 4.4528608322143555, 'learning_rate': 2.872502652026178e-06, 'epoch': 0.76} [Rank 3] Trainer log: {'loss': 0.7603, 'grad_norm': 5.620354175567627, 'learning_rate': 2.8677212164812464e-06} [Rank 0] Trainer log: {'loss': 0.7603, 'grad_norm': 5.620354175567627, 'learning_rate': 2.8677212164812464e-06}[Rank 2] Trainer log: {'loss': 0.7603, 'grad_norm': 5.620354175567627, 'learning_rate': 2.8677212164812464e-06} [Rank 1] Trainer log: {'loss': 0.7603, 'grad_norm': 5.620354175567627, 'learning_rate': 2.8677212164812464e-06} {'loss': 0.7603, 'grad_norm': 5.620354175567627, 'learning_rate': 2.8677212164812464e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.9528, 'grad_norm': 2.9935309886932373, 'learning_rate': 2.862943097524994e-06}[Rank 3] Trainer log: {'loss': 0.9528, 'grad_norm': 2.9935309886932373, 'learning_rate': 2.862943097524994e-06} [Rank 0] Trainer log: {'loss': 0.9528, 'grad_norm': 2.9935309886932373, 'learning_rate': 2.862943097524994e-06} [Rank 1] Trainer log: {'loss': 0.9528, 'grad_norm': 2.9935309886932373, 'learning_rate': 2.862943097524994e-06} {'loss': 0.9528, 'grad_norm': 2.9935309886932373, 'learning_rate': 2.862943097524994e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8517, 'grad_norm': 4.037564754486084, 'learning_rate': 2.8581682973793057e-06}[Rank 3] Trainer log: {'loss': 0.8517, 'grad_norm': 4.037564754486084, 'learning_rate': 2.8581682973793057e-06} [Rank 0] Trainer log: {'loss': 0.8517, 'grad_norm': 4.037564754486084, 'learning_rate': 2.8581682973793057e-06} [Rank 1] Trainer log: {'loss': 0.8517, 'grad_norm': 4.037564754486084, 'learning_rate': 2.8581682973793057e-06} {'loss': 0.8517, 'grad_norm': 4.037564754486084, 'learning_rate': 2.8581682973793057e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8465, 'grad_norm': 6.498946666717529, 'learning_rate': 2.85339681826451e-06} [Rank 3] Trainer log: {'loss': 0.8465, 'grad_norm': 6.498946666717529, 'learning_rate': 2.85339681826451e-06} [Rank 1] Trainer log: {'loss': 0.8465, 'grad_norm': 6.498946666717529, 'learning_rate': 2.85339681826451e-06} [Rank 0] Trainer log: {'loss': 0.8465, 'grad_norm': 6.498946666717529, 'learning_rate': 2.85339681826451e-06} {'loss': 0.8465, 'grad_norm': 6.498946666717529, 'learning_rate': 2.85339681826451e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.7273, 'grad_norm': 3.903646945953369, 'learning_rate': 2.848628662399404e-06}[Rank 2] Trainer log: {'loss': 0.7273, 'grad_norm': 3.903646945953369, 'learning_rate': 2.848628662399404e-06}[Rank 1] Trainer log: {'loss': 0.7273, 'grad_norm': 3.903646945953369, 'learning_rate': 2.848628662399404e-06} [Rank 0] Trainer log: {'loss': 0.7273, 'grad_norm': 3.903646945953369, 'learning_rate': 2.848628662399404e-06} {'loss': 0.7273, 'grad_norm': 3.903646945953369, 'learning_rate': 2.848628662399404e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.5495, 'grad_norm': 6.710810661315918, 'learning_rate': 2.843863832001229e-06}[Rank 2] Trainer log: {'loss': 0.5495, 'grad_norm': 6.710810661315918, 'learning_rate': 2.843863832001229e-06}[Rank 0] Trainer log: {'loss': 0.5495, 'grad_norm': 6.710810661315918, 'learning_rate': 2.843863832001229e-06} [Rank 3] Trainer log: {'loss': 0.5495, 'grad_norm': 6.710810661315918, 'learning_rate': 2.843863832001229e-06} {'loss': 0.5495, 'grad_norm': 6.710810661315918, 'learning_rate': 2.843863832001229e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8228, 'grad_norm': 2.8252060413360596, 'learning_rate': 2.839102329285682e-06}[Rank 0] Trainer log: {'loss': 0.8228, 'grad_norm': 2.8252060413360596, 'learning_rate': 2.839102329285682e-06} [Rank 1] Trainer log: {'loss': 0.8228, 'grad_norm': 2.8252060413360596, 'learning_rate': 2.839102329285682e-06} [Rank 3] Trainer log: {'loss': 0.8228, 'grad_norm': 2.8252060413360596, 'learning_rate': 2.839102329285682e-06} {'loss': 0.8228, 'grad_norm': 2.8252060413360596, 'learning_rate': 2.839102329285682e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.6806, 'grad_norm': 7.783218860626221, 'learning_rate': 2.8343441564669195e-06}[Rank 0] Trainer log: {'loss': 0.6806, 'grad_norm': 7.783218860626221, 'learning_rate': 2.8343441564669195e-06}[Rank 3] Trainer log: {'loss': 0.6806, 'grad_norm': 7.783218860626221, 'learning_rate': 2.8343441564669195e-06} [Rank 1] Trainer log: {'loss': 0.6806, 'grad_norm': 7.783218860626221, 'learning_rate': 2.8343441564669195e-06} {'loss': 0.6806, 'grad_norm': 7.783218860626221, 'learning_rate': 2.8343441564669195e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.9841, 'grad_norm': 2.104217767715454, 'learning_rate': 2.8295893157575438e-06}[Rank 0] Trainer log: {'loss': 0.9841, 'grad_norm': 2.104217767715454, 'learning_rate': 2.8295893157575438e-06} [Rank 3] Trainer log: {'loss': 0.9841, 'grad_norm': 2.104217767715454, 'learning_rate': 2.8295893157575438e-06} [Rank 1] Trainer log: {'loss': 0.9841, 'grad_norm': 2.104217767715454, 'learning_rate': 2.8295893157575438e-06} {'loss': 0.9841, 'grad_norm': 2.104217767715454, 'learning_rate': 2.8295893157575438e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 1.0265, 'grad_norm': 2.612687349319458, 'learning_rate': 2.8248378093686023e-06}[Rank 3] Trainer log: {'loss': 1.0265, 'grad_norm': 2.612687349319458, 'learning_rate': 2.8248378093686023e-06} [Rank 0] Trainer log: {'loss': 1.0265, 'grad_norm': 2.612687349319458, 'learning_rate': 2.8248378093686023e-06} [Rank 1] Trainer log: {'loss': 1.0265, 'grad_norm': 2.612687349319458, 'learning_rate': 2.8248378093686023e-06} {'loss': 1.0265, 'grad_norm': 2.612687349319458, 'learning_rate': 2.8248378093686023e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.5783, 'grad_norm': 5.352138042449951, 'learning_rate': 2.8200896395096035e-06} [Rank 3] Trainer log: {'loss': 0.5783, 'grad_norm': 5.352138042449951, 'learning_rate': 2.8200896395096035e-06}[Rank 0] Trainer log: {'loss': 0.5783, 'grad_norm': 5.352138042449951, 'learning_rate': 2.8200896395096035e-06} [Rank 1] Trainer log: {'loss': 0.5783, 'grad_norm': 5.352138042449951, 'learning_rate': 2.8200896395096035e-06} {'loss': 0.5783, 'grad_norm': 5.352138042449951, 'learning_rate': 2.8200896395096035e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.8848, 'grad_norm': 4.215061187744141, 'learning_rate': 2.8153448083885017e-06}[Rank 2] Trainer log: {'loss': 0.8848, 'grad_norm': 4.215061187744141, 'learning_rate': 2.8153448083885017e-06} [Rank 1] Trainer log: {'loss': 0.8848, 'grad_norm': 4.215061187744141, 'learning_rate': 2.8153448083885017e-06} [Rank 3] Trainer log: {'loss': 0.8848, 'grad_norm': 4.215061187744141, 'learning_rate': 2.8153448083885017e-06} {'loss': 0.8848, 'grad_norm': 4.215061187744141, 'learning_rate': 2.8153448083885017e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 1.0473, 'grad_norm': 4.155832767486572, 'learning_rate': 2.8106033182116886e-06}[Rank 3] Trainer log: {'loss': 1.0473, 'grad_norm': 4.155832767486572, 'learning_rate': 2.8106033182116886e-06} [Rank 0] Trainer log: {'loss': 1.0473, 'grad_norm': 4.155832767486572, 'learning_rate': 2.8106033182116886e-06} [Rank 1] Trainer log: {'loss': 1.0473, 'grad_norm': 4.155832767486572, 'learning_rate': 2.8106033182116886e-06} {'loss': 1.0473, 'grad_norm': 4.155832767486572, 'learning_rate': 2.8106033182116886e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.8817, 'grad_norm': 6.497180461883545, 'learning_rate': 2.805865171184018e-06}[Rank 2] Trainer log: {'loss': 0.8817, 'grad_norm': 6.497180461883545, 'learning_rate': 2.805865171184018e-06} [Rank 1] Trainer log: {'loss': 0.8817, 'grad_norm': 6.497180461883545, 'learning_rate': 2.805865171184018e-06} [Rank 0] Trainer log: {'loss': 0.8817, 'grad_norm': 6.497180461883545, 'learning_rate': 2.805865171184018e-06} {'loss': 0.8817, 'grad_norm': 6.497180461883545, 'learning_rate': 2.805865171184018e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.6736, 'grad_norm': 19.713607788085938, 'learning_rate': 2.8011303695087755e-06}[Rank 0] Trainer log: {'loss': 0.6736, 'grad_norm': 19.713607788085938, 'learning_rate': 2.8011303695087755e-06} [Rank 3] Trainer log: {'loss': 0.6736, 'grad_norm': 19.713607788085938, 'learning_rate': 2.8011303695087755e-06} [Rank 1] Trainer log: {'loss': 0.6736, 'grad_norm': 19.713607788085938, 'learning_rate': 2.8011303695087755e-06} {'loss': 0.6736, 'grad_norm': 19.713607788085938, 'learning_rate': 2.8011303695087755e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.8756, 'grad_norm': 4.505522727966309, 'learning_rate': 2.7963989153876938e-06} [Rank 0] Trainer log: {'loss': 0.8756, 'grad_norm': 4.505522727966309, 'learning_rate': 2.7963989153876938e-06}[Rank 1] Trainer log: {'loss': 0.8756, 'grad_norm': 4.505522727966309, 'learning_rate': 2.7963989153876938e-06} [Rank 2] Trainer log: {'loss': 0.8756, 'grad_norm': 4.505522727966309, 'learning_rate': 2.7963989153876938e-06} {'loss': 0.8756, 'grad_norm': 4.505522727966309, 'learning_rate': 2.7963989153876938e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.8706, 'grad_norm': 4.591093063354492, 'learning_rate': 2.791670811020959e-06} [Rank 2] Trainer log: {'loss': 0.8706, 'grad_norm': 4.591093063354492, 'learning_rate': 2.791670811020959e-06} [Rank 1] Trainer log: {'loss': 0.8706, 'grad_norm': 4.591093063354492, 'learning_rate': 2.791670811020959e-06} [Rank 0] Trainer log: {'loss': 0.8706, 'grad_norm': 4.591093063354492, 'learning_rate': 2.791670811020959e-06} {'loss': 0.8706, 'grad_norm': 4.591093063354492, 'learning_rate': 2.791670811020959e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.7938, 'grad_norm': 2.898714303970337, 'learning_rate': 2.786946058607187e-06} [Rank 0] Trainer log: {'loss': 0.7938, 'grad_norm': 2.898714303970337, 'learning_rate': 2.786946058607187e-06}[Rank 1] Trainer log: {'loss': 0.7938, 'grad_norm': 2.898714303970337, 'learning_rate': 2.786946058607187e-06} [Rank 3] Trainer log: {'loss': 0.7938, 'grad_norm': 2.898714303970337, 'learning_rate': 2.786946058607187e-06} {'loss': 0.7938, 'grad_norm': 2.898714303970337, 'learning_rate': 2.786946058607187e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.8801, 'grad_norm': 4.647657871246338, 'learning_rate': 2.7822246603434387e-06}[Rank 2] Trainer log: {'loss': 0.8801, 'grad_norm': 4.647657871246338, 'learning_rate': 2.7822246603434387e-06}[Rank 3] Trainer log: {'loss': 0.8801, 'grad_norm': 4.647657871246338, 'learning_rate': 2.7822246603434387e-06} [Rank 1] Trainer log: {'loss': 0.8801, 'grad_norm': 4.647657871246338, 'learning_rate': 2.7822246603434387e-06} {'loss': 0.8801, 'grad_norm': 4.647657871246338, 'learning_rate': 2.7822246603434387e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8144, 'grad_norm': 5.917095184326172, 'learning_rate': 2.7775066184252196e-06} [Rank 1] Trainer log: {'loss': 0.8144, 'grad_norm': 5.917095184326172, 'learning_rate': 2.7775066184252196e-06} [Rank 0] Trainer log: {'loss': 0.8144, 'grad_norm': 5.917095184326172, 'learning_rate': 2.7775066184252196e-06}[Rank 3] Trainer log: {'loss': 0.8144, 'grad_norm': 5.917095184326172, 'learning_rate': 2.7775066184252196e-06} {'loss': 0.8144, 'grad_norm': 5.917095184326172, 'learning_rate': 2.7775066184252196e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.6292, 'grad_norm': 7.351046085357666, 'learning_rate': 2.772791935046473e-06}[Rank 2] Trainer log: {'loss': 0.6292, 'grad_norm': 7.351046085357666, 'learning_rate': 2.772791935046473e-06} [Rank 3] Trainer log: {'loss': 0.6292, 'grad_norm': 7.351046085357666, 'learning_rate': 2.772791935046473e-06} [Rank 1] Trainer log: {'loss': 0.6292, 'grad_norm': 7.351046085357666, 'learning_rate': 2.772791935046473e-06} {'loss': 0.6292, 'grad_norm': 7.351046085357666, 'learning_rate': 2.772791935046473e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.7558, 'grad_norm': 3.7266056537628174, 'learning_rate': 2.768080612399574e-06}[Rank 3] Trainer log: {'loss': 0.7558, 'grad_norm': 3.7266056537628174, 'learning_rate': 2.768080612399574e-06} [Rank 1] Trainer log: {'loss': 0.7558, 'grad_norm': 3.7266056537628174, 'learning_rate': 2.768080612399574e-06} [Rank 0] Trainer log: {'loss': 0.7558, 'grad_norm': 3.7266056537628174, 'learning_rate': 2.768080612399574e-06} {'loss': 0.7558, 'grad_norm': 3.7266056537628174, 'learning_rate': 2.768080612399574e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.7369, 'grad_norm': 5.054093837738037, 'learning_rate': 2.7633726526753457e-06}[Rank 0] Trainer log: {'loss': 0.7369, 'grad_norm': 5.054093837738037, 'learning_rate': 2.7633726526753457e-06} [Rank 3] Trainer log: {'loss': 0.7369, 'grad_norm': 5.054093837738037, 'learning_rate': 2.7633726526753457e-06} [Rank 2] Trainer log: {'loss': 0.7369, 'grad_norm': 5.054093837738037, 'learning_rate': 2.7633726526753457e-06} {'loss': 0.7369, 'grad_norm': 5.054093837738037, 'learning_rate': 2.7633726526753457e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.6825, 'grad_norm': 4.230699062347412, 'learning_rate': 2.7586680580630386e-06}[Rank 0] Trainer log: {'loss': 0.6825, 'grad_norm': 4.230699062347412, 'learning_rate': 2.7586680580630386e-06}[Rank 3] Trainer log: {'loss': 0.6825, 'grad_norm': 4.230699062347412, 'learning_rate': 2.7586680580630386e-06} [Rank 1] Trainer log: {'loss': 0.6825, 'grad_norm': 4.230699062347412, 'learning_rate': 2.7586680580630386e-06} {'loss': 0.6825, 'grad_norm': 4.230699062347412, 'learning_rate': 2.7586680580630386e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8301, 'grad_norm': 10.317546844482422, 'learning_rate': 2.7539668307503386e-06} [Rank 1] Trainer log: {'loss': 0.8301, 'grad_norm': 10.317546844482422, 'learning_rate': 2.7539668307503386e-06} [Rank 0] Trainer log: {'loss': 0.8301, 'grad_norm': 10.317546844482422, 'learning_rate': 2.7539668307503386e-06}[Rank 3] Trainer log: {'loss': 0.8301, 'grad_norm': 10.317546844482422, 'learning_rate': 2.7539668307503386e-06} {'loss': 0.8301, 'grad_norm': 10.317546844482422, 'learning_rate': 2.7539668307503386e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 1.0061, 'grad_norm': 5.8798723220825195, 'learning_rate': 2.7492689729233737e-06}[Rank 3] Trainer log: {'loss': 1.0061, 'grad_norm': 5.8798723220825195, 'learning_rate': 2.7492689729233737e-06} [Rank 0] Trainer log: {'loss': 1.0061, 'grad_norm': 5.8798723220825195, 'learning_rate': 2.7492689729233737e-06} [Rank 1] Trainer log: {'loss': 1.0061, 'grad_norm': 5.8798723220825195, 'learning_rate': 2.7492689729233737e-06} {'loss': 1.0061, 'grad_norm': 5.8798723220825195, 'learning_rate': 2.7492689729233737e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.9168, 'grad_norm': 4.363884449005127, 'learning_rate': 2.7445744867666935e-06}[Rank 0] Trainer log: {'loss': 0.9168, 'grad_norm': 4.363884449005127, 'learning_rate': 2.7445744867666935e-06} [Rank 3] Trainer log: {'loss': 0.9168, 'grad_norm': 4.363884449005127, 'learning_rate': 2.7445744867666935e-06} {'loss': 0.9168, 'grad_norm': 4.363884449005127, 'learning_rate': 2.7445744867666935e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.9168, 'grad_norm': 4.363884449005127, 'learning_rate': 2.7445744867666935e-06} [Rank 2] Trainer log: {'loss': 0.8433, 'grad_norm': 3.5118939876556396, 'learning_rate': 2.7398833744632934e-06}[Rank 0] Trainer log: {'loss': 0.8433, 'grad_norm': 3.5118939876556396, 'learning_rate': 2.7398833744632934e-06} [Rank 3] Trainer log: {'loss': 0.8433, 'grad_norm': 3.5118939876556396, 'learning_rate': 2.7398833744632934e-06} [Rank 1] Trainer log: {'loss': 0.8433, 'grad_norm': 3.5118939876556396, 'learning_rate': 2.7398833744632934e-06} {'loss': 0.8433, 'grad_norm': 3.5118939876556396, 'learning_rate': 2.7398833744632934e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.6868, 'grad_norm': 3.3412628173828125, 'learning_rate': 2.7351956381945865e-06}[Rank 3] Trainer log: {'loss': 0.6868, 'grad_norm': 3.3412628173828125, 'learning_rate': 2.7351956381945865e-06}[Rank 2] Trainer log: {'loss': 0.6868, 'grad_norm': 3.3412628173828125, 'learning_rate': 2.7351956381945865e-06} [Rank 1] Trainer log: {'loss': 0.6868, 'grad_norm': 3.3412628173828125, 'learning_rate': 2.7351956381945865e-06} {'loss': 0.6868, 'grad_norm': 3.3412628173828125, 'learning_rate': 2.7351956381945865e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.6926, 'grad_norm': 6.026294231414795, 'learning_rate': 2.7305112801404277e-06}[Rank 2] Trainer log: {'loss': 0.6926, 'grad_norm': 6.026294231414795, 'learning_rate': 2.7305112801404277e-06} [Rank 3] Trainer log: {'loss': 0.6926, 'grad_norm': 6.026294231414795, 'learning_rate': 2.7305112801404277e-06} [Rank 0] Trainer log: {'loss': 0.6926, 'grad_norm': 6.026294231414795, 'learning_rate': 2.7305112801404277e-06} {'loss': 0.6926, 'grad_norm': 6.026294231414795, 'learning_rate': 2.7305112801404277e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.6157, 'grad_norm': 9.39620590209961, 'learning_rate': 2.7258303024790922e-06}[Rank 3] Trainer log: {'loss': 0.6157, 'grad_norm': 9.39620590209961, 'learning_rate': 2.7258303024790922e-06}[Rank 0] Trainer log: {'loss': 0.6157, 'grad_norm': 9.39620590209961, 'learning_rate': 2.7258303024790922e-06} [Rank 1] Trainer log: {'loss': 0.6157, 'grad_norm': 9.39620590209961, 'learning_rate': 2.7258303024790922e-06} {'loss': 0.6157, 'grad_norm': 9.39620590209961, 'learning_rate': 2.7258303024790922e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8359, 'grad_norm': 5.290535926818848, 'learning_rate': 2.7211527073872846e-06} [Rank 3] Trainer log: {'loss': 0.8359, 'grad_norm': 5.290535926818848, 'learning_rate': 2.7211527073872846e-06}[Rank 0] Trainer log: {'loss': 0.8359, 'grad_norm': 5.290535926818848, 'learning_rate': 2.7211527073872846e-06} [Rank 1] Trainer log: {'loss': 0.8359, 'grad_norm': 5.290535926818848, 'learning_rate': 2.7211527073872846e-06} {'loss': 0.8359, 'grad_norm': 5.290535926818848, 'learning_rate': 2.7211527073872846e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.9614, 'grad_norm': 5.314791202545166, 'learning_rate': 2.7164784970401426e-06} [Rank 0] Trainer log: {'loss': 0.9614, 'grad_norm': 5.314791202545166, 'learning_rate': 2.7164784970401426e-06} [Rank 1] Trainer log: {'loss': 0.9614, 'grad_norm': 5.314791202545166, 'learning_rate': 2.7164784970401426e-06} [Rank 3] Trainer log: {'loss': 0.9614, 'grad_norm': 5.314791202545166, 'learning_rate': 2.7164784970401426e-06} {'loss': 0.9614, 'grad_norm': 5.314791202545166, 'learning_rate': 2.7164784970401426e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8878, 'grad_norm': 2.926307201385498, 'learning_rate': 2.7118076736112243e-06}[Rank 3] Trainer log: {'loss': 0.8878, 'grad_norm': 2.926307201385498, 'learning_rate': 2.7118076736112243e-06} [Rank 0] Trainer log: {'loss': 0.8878, 'grad_norm': 2.926307201385498, 'learning_rate': 2.7118076736112243e-06} [Rank 1] Trainer log: {'loss': 0.8878, 'grad_norm': 2.926307201385498, 'learning_rate': 2.7118076736112243e-06} {'loss': 0.8878, 'grad_norm': 2.926307201385498, 'learning_rate': 2.7118076736112243e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.6503, 'grad_norm': 4.50838041305542, 'learning_rate': 2.70714023927251e-06}[Rank 0] Trainer log: {'loss': 0.6503, 'grad_norm': 4.50838041305542, 'learning_rate': 2.70714023927251e-06} [Rank 3] Trainer log: {'loss': 0.6503, 'grad_norm': 4.50838041305542, 'learning_rate': 2.70714023927251e-06}[Rank 1] Trainer log: {'loss': 0.6503, 'grad_norm': 4.50838041305542, 'learning_rate': 2.70714023927251e-06} {'loss': 0.6503, 'grad_norm': 4.50838041305542, 'learning_rate': 2.70714023927251e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.9704, 'grad_norm': 6.933302879333496, 'learning_rate': 2.7024761961944123e-06} [Rank 0] Trainer log: {'loss': 0.9704, 'grad_norm': 6.933302879333496, 'learning_rate': 2.7024761961944123e-06}[Rank 3] Trainer log: {'loss': 0.9704, 'grad_norm': 6.933302879333496, 'learning_rate': 2.7024761961944123e-06} [Rank 1] Trainer log: {'loss': 0.9704, 'grad_norm': 6.933302879333496, 'learning_rate': 2.7024761961944123e-06} {'loss': 0.9704, 'grad_norm': 6.933302879333496, 'learning_rate': 2.7024761961944123e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.9173, 'grad_norm': 13.793785095214844, 'learning_rate': 2.6978155465457667e-06}[Rank 0] Trainer log: {'loss': 0.9173, 'grad_norm': 13.793785095214844, 'learning_rate': 2.6978155465457667e-06} [Rank 2] Trainer log: {'loss': 0.9173, 'grad_norm': 13.793785095214844, 'learning_rate': 2.6978155465457667e-06} [Rank 3] Trainer log: {'loss': 0.9173, 'grad_norm': 13.793785095214844, 'learning_rate': 2.6978155465457667e-06} {'loss': 0.9173, 'grad_norm': 13.793785095214844, 'learning_rate': 2.6978155465457667e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.7249, 'grad_norm': 7.591262340545654, 'learning_rate': 2.693158292493819e-06}[Rank 3] Trainer log: {'loss': 0.7249, 'grad_norm': 7.591262340545654, 'learning_rate': 2.693158292493819e-06} [Rank 1] Trainer log: {'loss': 0.7249, 'grad_norm': 7.591262340545654, 'learning_rate': 2.693158292493819e-06} [Rank 0] Trainer log: {'loss': 0.7249, 'grad_norm': 7.591262340545654, 'learning_rate': 2.693158292493819e-06} {'loss': 0.7249, 'grad_norm': 7.591262340545654, 'learning_rate': 2.693158292493819e-06, 'epoch': 0.77} [Rank 2] Trainer log: {'loss': 0.8097, 'grad_norm': 7.351962089538574, 'learning_rate': 2.6885044362042524e-06}[Rank 0] Trainer log: {'loss': 0.8097, 'grad_norm': 7.351962089538574, 'learning_rate': 2.6885044362042524e-06}[Rank 3] Trainer log: {'loss': 0.8097, 'grad_norm': 7.351962089538574, 'learning_rate': 2.6885044362042524e-06} {'loss': 0.8097, 'grad_norm': 7.351962089538574, 'learning_rate': 2.6885044362042524e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.8097, 'grad_norm': 7.351962089538574, 'learning_rate': 2.6885044362042524e-06} [Rank 0] Trainer log: {'loss': 0.9659, 'grad_norm': 4.796856880187988, 'learning_rate': 2.6838539798411565e-06}[Rank 2] Trainer log: {'loss': 0.9659, 'grad_norm': 4.796856880187988, 'learning_rate': 2.6838539798411565e-06}[Rank 3] Trainer log: {'loss': 0.9659, 'grad_norm': 4.796856880187988, 'learning_rate': 2.6838539798411565e-06} [Rank 1] Trainer log: {'loss': 0.9659, 'grad_norm': 4.796856880187988, 'learning_rate': 2.6838539798411565e-06} {'loss': 0.9659, 'grad_norm': 4.796856880187988, 'learning_rate': 2.6838539798411565e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.7397, 'grad_norm': 3.9981260299682617, 'learning_rate': 2.679206925567044e-06}[Rank 3] Trainer log: {'loss': 0.7397, 'grad_norm': 3.9981260299682617, 'learning_rate': 2.679206925567044e-06}[Rank 0] Trainer log: {'loss': 0.7397, 'grad_norm': 3.9981260299682617, 'learning_rate': 2.679206925567044e-06} [Rank 2] Trainer log: {'loss': 0.7397, 'grad_norm': 3.9981260299682617, 'learning_rate': 2.679206925567044e-06} {'loss': 0.7397, 'grad_norm': 3.9981260299682617, 'learning_rate': 2.679206925567044e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.7087, 'grad_norm': 2.4776275157928467, 'learning_rate': 2.6745632755428508e-06}[Rank 1] Trainer log: {'loss': 0.7087, 'grad_norm': 2.4776275157928467, 'learning_rate': 2.6745632755428508e-06}[Rank 2] Trainer log: {'loss': 0.7087, 'grad_norm': 2.4776275157928467, 'learning_rate': 2.6745632755428508e-06} [Rank 0] Trainer log: {'loss': 0.7087, 'grad_norm': 2.4776275157928467, 'learning_rate': 2.6745632755428508e-06} {'loss': 0.7087, 'grad_norm': 2.4776275157928467, 'learning_rate': 2.6745632755428508e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.7588, 'grad_norm': 6.309602737426758, 'learning_rate': 2.6699230319279236e-06} [Rank 2] Trainer log: {'loss': 0.7588, 'grad_norm': 6.309602737426758, 'learning_rate': 2.6699230319279236e-06} [Rank 1] Trainer log: {'loss': 0.7588, 'grad_norm': 6.309602737426758, 'learning_rate': 2.6699230319279236e-06} [Rank 0] Trainer log: {'loss': 0.7588, 'grad_norm': 6.309602737426758, 'learning_rate': 2.6699230319279236e-06} {'loss': 0.7588, 'grad_norm': 6.309602737426758, 'learning_rate': 2.6699230319279236e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.9096, 'grad_norm': 2.425522804260254, 'learning_rate': 2.6652861968800224e-06}[Rank 0] Trainer log: {'loss': 0.9096, 'grad_norm': 2.425522804260254, 'learning_rate': 2.6652861968800224e-06}[Rank 1] Trainer log: {'loss': 0.9096, 'grad_norm': 2.425522804260254, 'learning_rate': 2.6652861968800224e-06} [Rank 2] Trainer log: {'loss': 0.9096, 'grad_norm': 2.425522804260254, 'learning_rate': 2.6652861968800224e-06} {'loss': 0.9096, 'grad_norm': 2.425522804260254, 'learning_rate': 2.6652861968800224e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 1.0183, 'grad_norm': 7.8625569343566895, 'learning_rate': 2.6606527725553334e-06} [Rank 1] Trainer log: {'loss': 1.0183, 'grad_norm': 7.8625569343566895, 'learning_rate': 2.6606527725553334e-06}[Rank 2] Trainer log: {'loss': 1.0183, 'grad_norm': 7.8625569343566895, 'learning_rate': 2.6606527725553334e-06} [Rank 0] Trainer log: {'loss': 1.0183, 'grad_norm': 7.8625569343566895, 'learning_rate': 2.6606527725553334e-06} {'loss': 1.0183, 'grad_norm': 7.8625569343566895, 'learning_rate': 2.6606527725553334e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.9075, 'grad_norm': 2.8774094581604004, 'learning_rate': 2.65602276110845e-06}[Rank 3] Trainer log: {'loss': 0.9075, 'grad_norm': 2.8774094581604004, 'learning_rate': 2.65602276110845e-06} [Rank 2] Trainer log: {'loss': 0.9075, 'grad_norm': 2.8774094581604004, 'learning_rate': 2.65602276110845e-06} [Rank 1] Trainer log: {'loss': 0.9075, 'grad_norm': 2.8774094581604004, 'learning_rate': 2.65602276110845e-06}{'loss': 0.9075, 'grad_norm': 2.8774094581604004, 'learning_rate': 2.65602276110845e-06, 'epoch': 0.77} [Rank 0] Trainer log: {'loss': 0.8685, 'grad_norm': 2.108820915222168, 'learning_rate': 2.6513961646923734e-06}[Rank 3] Trainer log: {'loss': 0.8685, 'grad_norm': 2.108820915222168, 'learning_rate': 2.6513961646923734e-06}[Rank 2] Trainer log: {'loss': 0.8685, 'grad_norm': 2.108820915222168, 'learning_rate': 2.6513961646923734e-06} [Rank 1] Trainer log: {'loss': 0.8685, 'grad_norm': 2.108820915222168, 'learning_rate': 2.6513961646923734e-06} {'loss': 0.8685, 'grad_norm': 2.108820915222168, 'learning_rate': 2.6513961646923734e-06, 'epoch': 0.77} [Rank 3] Trainer log: {'loss': 0.7306, 'grad_norm': 11.85863971710205, 'learning_rate': 2.64677298545853e-06} [Rank 2] Trainer log: {'loss': 0.7306, 'grad_norm': 11.85863971710205, 'learning_rate': 2.64677298545853e-06} [Rank 0] Trainer log: {'loss': 0.7306, 'grad_norm': 11.85863971710205, 'learning_rate': 2.64677298545853e-06}[Rank 1] Trainer log: {'loss': 0.7306, 'grad_norm': 11.85863971710205, 'learning_rate': 2.64677298545853e-06} {'loss': 0.7306, 'grad_norm': 11.85863971710205, 'learning_rate': 2.64677298545853e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.6912, 'grad_norm': 2.2956137657165527, 'learning_rate': 2.642153225556745e-06}[Rank 0] Trainer log: {'loss': 0.6912, 'grad_norm': 2.2956137657165527, 'learning_rate': 2.642153225556745e-06} [Rank 3] Trainer log: {'loss': 0.6912, 'grad_norm': 2.2956137657165527, 'learning_rate': 2.642153225556745e-06} [Rank 2] Trainer log: {'loss': 0.6912, 'grad_norm': 2.2956137657165527, 'learning_rate': 2.642153225556745e-06} {'loss': 0.6912, 'grad_norm': 2.2956137657165527, 'learning_rate': 2.642153225556745e-06, 'epoch': 0.77} [Rank 1] Trainer log: {'loss': 0.9649, 'grad_norm': 2.4154276847839355, 'learning_rate': 2.6375368871352557e-06} [Rank 3] Trainer log: {'loss': 0.9649, 'grad_norm': 2.4154276847839355, 'learning_rate': 2.6375368871352557e-06} [Rank 0] Trainer log: {'loss': 0.9649, 'grad_norm': 2.4154276847839355, 'learning_rate': 2.6375368871352557e-06}[Rank 2] Trainer log: {'loss': 0.9649, 'grad_norm': 2.4154276847839355, 'learning_rate': 2.6375368871352557e-06} {'loss': 0.9649, 'grad_norm': 2.4154276847839355, 'learning_rate': 2.6375368871352557e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8723, 'grad_norm': 5.680726528167725, 'learning_rate': 2.632923972340715e-06}[Rank 1] Trainer log: {'loss': 0.8723, 'grad_norm': 5.680726528167725, 'learning_rate': 2.632923972340715e-06} [Rank 0] Trainer log: {'loss': 0.8723, 'grad_norm': 5.680726528167725, 'learning_rate': 2.632923972340715e-06} [Rank 2] Trainer log: {'loss': 0.8723, 'grad_norm': 5.680726528167725, 'learning_rate': 2.632923972340715e-06} {'loss': 0.8723, 'grad_norm': 5.680726528167725, 'learning_rate': 2.632923972340715e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.7253, 'grad_norm': 5.155200004577637, 'learning_rate': 2.628314483318178e-06}[Rank 1] Trainer log: {'loss': 0.7253, 'grad_norm': 5.155200004577637, 'learning_rate': 2.628314483318178e-06}[Rank 2] Trainer log: {'loss': 0.7253, 'grad_norm': 5.155200004577637, 'learning_rate': 2.628314483318178e-06} [Rank 3] Trainer log: {'loss': 0.7253, 'grad_norm': 5.155200004577637, 'learning_rate': 2.628314483318178e-06} {'loss': 0.7253, 'grad_norm': 5.155200004577637, 'learning_rate': 2.628314483318178e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.952, 'grad_norm': 2.2268335819244385, 'learning_rate': 2.6237084222111054e-06}[Rank 3] Trainer log: {'loss': 0.952, 'grad_norm': 2.2268335819244385, 'learning_rate': 2.6237084222111054e-06} [Rank 2] Trainer log: {'loss': 0.952, 'grad_norm': 2.2268335819244385, 'learning_rate': 2.6237084222111054e-06} [Rank 1] Trainer log: {'loss': 0.952, 'grad_norm': 2.2268335819244385, 'learning_rate': 2.6237084222111054e-06} {'loss': 0.952, 'grad_norm': 2.2268335819244385, 'learning_rate': 2.6237084222111054e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 1.018, 'grad_norm': 7.705598831176758, 'learning_rate': 2.6191057911613682e-06}[Rank 1] Trainer log: {'loss': 1.018, 'grad_norm': 7.705598831176758, 'learning_rate': 2.6191057911613682e-06} [Rank 0] Trainer log: {'loss': 1.018, 'grad_norm': 7.705598831176758, 'learning_rate': 2.6191057911613682e-06}[Rank 3] Trainer log: {'loss': 1.018, 'grad_norm': 7.705598831176758, 'learning_rate': 2.6191057911613682e-06} {'loss': 1.018, 'grad_norm': 7.705598831176758, 'learning_rate': 2.6191057911613682e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.777, 'grad_norm': 2.4273834228515625, 'learning_rate': 2.614506592309245e-06} [Rank 0] Trainer log: {'loss': 0.777, 'grad_norm': 2.4273834228515625, 'learning_rate': 2.614506592309245e-06}[Rank 1] Trainer log: {'loss': 0.777, 'grad_norm': 2.4273834228515625, 'learning_rate': 2.614506592309245e-06} [Rank 2] Trainer log: {'loss': 0.777, 'grad_norm': 2.4273834228515625, 'learning_rate': 2.614506592309245e-06} {'loss': 0.777, 'grad_norm': 2.4273834228515625, 'learning_rate': 2.614506592309245e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.9034, 'grad_norm': 3.3168742656707764, 'learning_rate': 2.6099108277934105e-06} [Rank 2] Trainer log: {'loss': 0.9034, 'grad_norm': 3.3168742656707764, 'learning_rate': 2.6099108277934105e-06}[Rank 0] Trainer log: {'loss': 0.9034, 'grad_norm': 3.3168742656707764, 'learning_rate': 2.6099108277934105e-06} [Rank 1] Trainer log: {'loss': 0.9034, 'grad_norm': 3.3168742656707764, 'learning_rate': 2.6099108277934105e-06} {'loss': 0.9034, 'grad_norm': 3.3168742656707764, 'learning_rate': 2.6099108277934105e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.7912, 'grad_norm': 6.506759166717529, 'learning_rate': 2.605318499750944e-06}[Rank 0] Trainer log: {'loss': 0.7912, 'grad_norm': 6.506759166717529, 'learning_rate': 2.605318499750944e-06} [Rank 2] Trainer log: {'loss': 0.7912, 'grad_norm': 6.506759166717529, 'learning_rate': 2.605318499750944e-06} [Rank 1] Trainer log: {'loss': 0.7912, 'grad_norm': 6.506759166717529, 'learning_rate': 2.605318499750944e-06} {'loss': 0.7912, 'grad_norm': 6.506759166717529, 'learning_rate': 2.605318499750944e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.6277, 'grad_norm': 4.410257816314697, 'learning_rate': 2.600729610317334e-06}[Rank 2] Trainer log: {'loss': 0.6277, 'grad_norm': 4.410257816314697, 'learning_rate': 2.600729610317334e-06}[Rank 3] Trainer log: {'loss': 0.6277, 'grad_norm': 4.410257816314697, 'learning_rate': 2.600729610317334e-06} [Rank 1] Trainer log: {'loss': 0.6277, 'grad_norm': 4.410257816314697, 'learning_rate': 2.600729610317334e-06} {'loss': 0.6277, 'grad_norm': 4.410257816314697, 'learning_rate': 2.600729610317334e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8898, 'grad_norm': 5.257806777954102, 'learning_rate': 2.5961441616264626e-06} [Rank 0] Trainer log: {'loss': 0.8898, 'grad_norm': 5.257806777954102, 'learning_rate': 2.5961441616264626e-06}[Rank 1] Trainer log: {'loss': 0.8898, 'grad_norm': 5.257806777954102, 'learning_rate': 2.5961441616264626e-06} [Rank 2] Trainer log: {'loss': 0.8898, 'grad_norm': 5.257806777954102, 'learning_rate': 2.5961441616264626e-06} {'loss': 0.8898, 'grad_norm': 5.257806777954102, 'learning_rate': 2.5961441616264626e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.7464, 'grad_norm': 5.138516426086426, 'learning_rate': 2.591562155810611e-06}[Rank 1] Trainer log: {'loss': 0.7464, 'grad_norm': 5.138516426086426, 'learning_rate': 2.591562155810611e-06} [Rank 0] Trainer log: {'loss': 0.7464, 'grad_norm': 5.138516426086426, 'learning_rate': 2.591562155810611e-06}[Rank 3] Trainer log: {'loss': 0.7464, 'grad_norm': 5.138516426086426, 'learning_rate': 2.591562155810611e-06} {'loss': 0.7464, 'grad_norm': 5.138516426086426, 'learning_rate': 2.591562155810611e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.7464, 'grad_norm': 2.94530987739563, 'learning_rate': 2.5869835950004683e-06}[Rank 2] Trainer log: {'loss': 0.7464, 'grad_norm': 2.94530987739563, 'learning_rate': 2.5869835950004683e-06}[Rank 0] Trainer log: {'loss': 0.7464, 'grad_norm': 2.94530987739563, 'learning_rate': 2.5869835950004683e-06} [Rank 1] Trainer log: {'loss': 0.7464, 'grad_norm': 2.94530987739563, 'learning_rate': 2.5869835950004683e-06} {'loss': 0.7464, 'grad_norm': 2.94530987739563, 'learning_rate': 2.5869835950004683e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.9615, 'grad_norm': 8.442651748657227, 'learning_rate': 2.5824084813251118e-06}[Rank 2] Trainer log: {'loss': 0.9615, 'grad_norm': 8.442651748657227, 'learning_rate': 2.5824084813251118e-06}[Rank 0] Trainer log: {'loss': 0.9615, 'grad_norm': 8.442651748657227, 'learning_rate': 2.5824084813251118e-06} {'loss': 0.9615, 'grad_norm': 8.442651748657227, 'learning_rate': 2.5824084813251118e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.9615, 'grad_norm': 8.442651748657227, 'learning_rate': 2.5824084813251118e-06} [Rank 1] Trainer log: {'loss': 0.6831, 'grad_norm': 5.215147018432617, 'learning_rate': 2.5778368169120205e-06}[Rank 3] Trainer log: {'loss': 0.6831, 'grad_norm': 5.215147018432617, 'learning_rate': 2.5778368169120205e-06}[Rank 2] Trainer log: {'loss': 0.6831, 'grad_norm': 5.215147018432617, 'learning_rate': 2.5778368169120205e-06} [Rank 0] Trainer log: {'loss': 0.6831, 'grad_norm': 5.215147018432617, 'learning_rate': 2.5778368169120205e-06} {'loss': 0.6831, 'grad_norm': 5.215147018432617, 'learning_rate': 2.5778368169120205e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.9037, 'grad_norm': 2.013399124145508, 'learning_rate': 2.5732686038870736e-06}[Rank 3] Trainer log: {'loss': 0.9037, 'grad_norm': 2.013399124145508, 'learning_rate': 2.5732686038870736e-06} [Rank 2] Trainer log: {'loss': 0.9037, 'grad_norm': 2.013399124145508, 'learning_rate': 2.5732686038870736e-06} [Rank 1] Trainer log: {'loss': 0.9037, 'grad_norm': 2.013399124145508, 'learning_rate': 2.5732686038870736e-06} {'loss': 0.9037, 'grad_norm': 2.013399124145508, 'learning_rate': 2.5732686038870736e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.9459, 'grad_norm': 8.995712280273438, 'learning_rate': 2.5687038443745373e-06} [Rank 0] Trainer log: {'loss': 0.9459, 'grad_norm': 8.995712280273438, 'learning_rate': 2.5687038443745373e-06}[Rank 2] Trainer log: {'loss': 0.9459, 'grad_norm': 8.995712280273438, 'learning_rate': 2.5687038443745373e-06}[Rank 3] Trainer log: {'loss': 0.9459, 'grad_norm': 8.995712280273438, 'learning_rate': 2.5687038443745373e-06} {'loss': 0.9459, 'grad_norm': 8.995712280273438, 'learning_rate': 2.5687038443745373e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.6027, 'grad_norm': 5.679854393005371, 'learning_rate': 2.5641425404970723e-06}[Rank 3] Trainer log: {'loss': 0.6027, 'grad_norm': 5.679854393005371, 'learning_rate': 2.5641425404970723e-06} [Rank 1] Trainer log: {'loss': 0.6027, 'grad_norm': 5.679854393005371, 'learning_rate': 2.5641425404970723e-06} [Rank 2] Trainer log: {'loss': 0.6027, 'grad_norm': 5.679854393005371, 'learning_rate': 2.5641425404970723e-06} {'loss': 0.6027, 'grad_norm': 5.679854393005371, 'learning_rate': 2.5641425404970723e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.5619, 'grad_norm': 5.531418800354004, 'learning_rate': 2.559584694375742e-06}[Rank 0] Trainer log: {'loss': 0.5619, 'grad_norm': 5.531418800354004, 'learning_rate': 2.559584694375742e-06}[Rank 3] Trainer log: {'loss': 0.5619, 'grad_norm': 5.531418800354004, 'learning_rate': 2.559584694375742e-06} {'loss': 0.5619, 'grad_norm': 5.531418800354004, 'learning_rate': 2.559584694375742e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.5619, 'grad_norm': 5.531418800354004, 'learning_rate': 2.559584694375742e-06} [Rank 3] Trainer log: {'loss': 0.945, 'grad_norm': 4.008352279663086, 'learning_rate': 2.555030308129992e-06} [Rank 2] Trainer log: {'loss': 0.945, 'grad_norm': 4.008352279663086, 'learning_rate': 2.555030308129992e-06} [Rank 0] Trainer log: {'loss': 0.945, 'grad_norm': 4.008352279663086, 'learning_rate': 2.555030308129992e-06}[Rank 1] Trainer log: {'loss': 0.945, 'grad_norm': 4.008352279663086, 'learning_rate': 2.555030308129992e-06} {'loss': 0.945, 'grad_norm': 4.008352279663086, 'learning_rate': 2.555030308129992e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.9689, 'grad_norm': 6.747896194458008, 'learning_rate': 2.5504793838776585e-06} [Rank 0] Trainer log: {'loss': 0.9689, 'grad_norm': 6.747896194458008, 'learning_rate': 2.5504793838776585e-06}[Rank 2] Trainer log: {'loss': 0.9689, 'grad_norm': 6.747896194458008, 'learning_rate': 2.5504793838776585e-06} [Rank 1] Trainer log: {'loss': 0.9689, 'grad_norm': 6.747896194458008, 'learning_rate': 2.5504793838776585e-06} {'loss': 0.9689, 'grad_norm': 6.747896194458008, 'learning_rate': 2.5504793838776585e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8183, 'grad_norm': 2.4416701793670654, 'learning_rate': 2.54593192373498e-06}[Rank 2] Trainer log: {'loss': 0.8183, 'grad_norm': 2.4416701793670654, 'learning_rate': 2.54593192373498e-06} [Rank 0] Trainer log: {'loss': 0.8183, 'grad_norm': 2.4416701793670654, 'learning_rate': 2.54593192373498e-06} [Rank 1] Trainer log: {'loss': 0.8183, 'grad_norm': 2.4416701793670654, 'learning_rate': 2.54593192373498e-06} {'loss': 0.8183, 'grad_norm': 2.4416701793670654, 'learning_rate': 2.54593192373498e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.8115, 'grad_norm': 5.6547441482543945, 'learning_rate': 2.541387929816568e-06}[Rank 3] Trainer log: {'loss': 0.8115, 'grad_norm': 5.6547441482543945, 'learning_rate': 2.541387929816568e-06} [Rank 1] Trainer log: {'loss': 0.8115, 'grad_norm': 5.6547441482543945, 'learning_rate': 2.541387929816568e-06} [Rank 2] Trainer log: {'loss': 0.8115, 'grad_norm': 5.6547441482543945, 'learning_rate': 2.541387929816568e-06} {'loss': 0.8115, 'grad_norm': 5.6547441482543945, 'learning_rate': 2.541387929816568e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.6925, 'grad_norm': 13.64535140991211, 'learning_rate': 2.536847404235433e-06}[Rank 3] Trainer log: {'loss': 0.6925, 'grad_norm': 13.64535140991211, 'learning_rate': 2.536847404235433e-06}[Rank 1] Trainer log: {'loss': 0.6925, 'grad_norm': 13.64535140991211, 'learning_rate': 2.536847404235433e-06} [Rank 2] Trainer log: {'loss': 0.6925, 'grad_norm': 13.64535140991211, 'learning_rate': 2.536847404235433e-06} {'loss': 0.6925, 'grad_norm': 13.64535140991211, 'learning_rate': 2.536847404235433e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.8579, 'grad_norm': 5.868387699127197, 'learning_rate': 2.5323103491029734e-06}[Rank 0] Trainer log: {'loss': 0.8579, 'grad_norm': 5.868387699127197, 'learning_rate': 2.5323103491029734e-06}[Rank 3] Trainer log: {'loss': 0.8579, 'grad_norm': 5.868387699127197, 'learning_rate': 2.5323103491029734e-06} [Rank 2] Trainer log: {'loss': 0.8579, 'grad_norm': 5.868387699127197, 'learning_rate': 2.5323103491029734e-06} {'loss': 0.8579, 'grad_norm': 5.868387699127197, 'learning_rate': 2.5323103491029734e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.6801, 'grad_norm': 2.0132617950439453, 'learning_rate': 2.527776766528968e-06}[Rank 0] Trainer log: {'loss': 0.6801, 'grad_norm': 2.0132617950439453, 'learning_rate': 2.527776766528968e-06} [Rank 1] Trainer log: {'loss': 0.6801, 'grad_norm': 2.0132617950439453, 'learning_rate': 2.527776766528968e-06} [Rank 2] Trainer log: {'loss': 0.6801, 'grad_norm': 2.0132617950439453, 'learning_rate': 2.527776766528968e-06} {'loss': 0.6801, 'grad_norm': 2.0132617950439453, 'learning_rate': 2.527776766528968e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.7569, 'grad_norm': 3.3288326263427734, 'learning_rate': 2.5232466586215777e-06} [Rank 0] Trainer log: {'loss': 0.7569, 'grad_norm': 3.3288326263427734, 'learning_rate': 2.5232466586215777e-06}[Rank 1] Trainer log: {'loss': 0.7569, 'grad_norm': 3.3288326263427734, 'learning_rate': 2.5232466586215777e-06} [Rank 2] Trainer log: {'loss': 0.7569, 'grad_norm': 3.3288326263427734, 'learning_rate': 2.5232466586215777e-06} {'loss': 0.7569, 'grad_norm': 3.3288326263427734, 'learning_rate': 2.5232466586215777e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.8099, 'grad_norm': 4.210941791534424, 'learning_rate': 2.5187200274873603e-06}[Rank 3] Trainer log: {'loss': 0.8099, 'grad_norm': 4.210941791534424, 'learning_rate': 2.5187200274873603e-06}[Rank 0] Trainer log: {'loss': 0.8099, 'grad_norm': 4.210941791534424, 'learning_rate': 2.5187200274873603e-06} [Rank 1] Trainer log: {'loss': 0.8099, 'grad_norm': 4.210941791534424, 'learning_rate': 2.5187200274873603e-06} {'loss': 0.8099, 'grad_norm': 4.210941791534424, 'learning_rate': 2.5187200274873603e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8239, 'grad_norm': 3.4376258850097656, 'learning_rate': 2.514196875231246e-06}[Rank 0] Trainer log: {'loss': 0.8239, 'grad_norm': 3.4376258850097656, 'learning_rate': 2.514196875231246e-06}[Rank 1] Trainer log: {'loss': 0.8239, 'grad_norm': 3.4376258850097656, 'learning_rate': 2.514196875231246e-06} [Rank 2] Trainer log: {'loss': 0.8239, 'grad_norm': 3.4376258850097656, 'learning_rate': 2.514196875231246e-06} {'loss': 0.8239, 'grad_norm': 3.4376258850097656, 'learning_rate': 2.514196875231246e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8707, 'grad_norm': 3.675140142440796, 'learning_rate': 2.5096772039565487e-06}[Rank 0] Trainer log: {'loss': 0.8707, 'grad_norm': 3.675140142440796, 'learning_rate': 2.5096772039565487e-06} [Rank 1] Trainer log: {'loss': 0.8707, 'grad_norm': 3.675140142440796, 'learning_rate': 2.5096772039565487e-06}[Rank 2] Trainer log: {'loss': 0.8707, 'grad_norm': 3.675140142440796, 'learning_rate': 2.5096772039565487e-06} {'loss': 0.8707, 'grad_norm': 3.675140142440796, 'learning_rate': 2.5096772039565487e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.9104, 'grad_norm': 3.339790105819702, 'learning_rate': 2.505161015764971e-06} [Rank 0] Trainer log: {'loss': 0.9104, 'grad_norm': 3.339790105819702, 'learning_rate': 2.505161015764971e-06} [Rank 1] Trainer log: {'loss': 0.9104, 'grad_norm': 3.339790105819702, 'learning_rate': 2.505161015764971e-06}[Rank 2] Trainer log: {'loss': 0.9104, 'grad_norm': 3.339790105819702, 'learning_rate': 2.505161015764971e-06} {'loss': 0.9104, 'grad_norm': 3.339790105819702, 'learning_rate': 2.505161015764971e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.4769, 'grad_norm': 3.929807186126709, 'learning_rate': 2.500648312756584e-06}[Rank 3] Trainer log: {'loss': 0.4769, 'grad_norm': 3.929807186126709, 'learning_rate': 2.500648312756584e-06}[Rank 1] Trainer log: {'loss': 0.4769, 'grad_norm': 3.929807186126709, 'learning_rate': 2.500648312756584e-06} [Rank 2] Trainer log: {'loss': 0.4769, 'grad_norm': 3.929807186126709, 'learning_rate': 2.500648312756584e-06} {'loss': 0.4769, 'grad_norm': 3.929807186126709, 'learning_rate': 2.500648312756584e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.8494, 'grad_norm': 4.139101982116699, 'learning_rate': 2.4961390970298507e-06}[Rank 1] Trainer log: {'loss': 0.8494, 'grad_norm': 4.139101982116699, 'learning_rate': 2.4961390970298507e-06} [Rank 3] Trainer log: {'loss': 0.8494, 'grad_norm': 4.139101982116699, 'learning_rate': 2.4961390970298507e-06} [Rank 2] Trainer log: {'loss': 0.8494, 'grad_norm': 4.139101982116699, 'learning_rate': 2.4961390970298507e-06} {'loss': 0.8494, 'grad_norm': 4.139101982116699, 'learning_rate': 2.4961390970298507e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.7571, 'grad_norm': 7.194714546203613, 'learning_rate': 2.4916333706816008e-06}[Rank 3] Trainer log: {'loss': 0.7571, 'grad_norm': 7.194714546203613, 'learning_rate': 2.4916333706816008e-06}[Rank 0] Trainer log: {'loss': 0.7571, 'grad_norm': 7.194714546203613, 'learning_rate': 2.4916333706816008e-06} [Rank 2] Trainer log: {'loss': 0.7571, 'grad_norm': 7.194714546203613, 'learning_rate': 2.4916333706816008e-06} {'loss': 0.7571, 'grad_norm': 7.194714546203613, 'learning_rate': 2.4916333706816008e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.7245, 'grad_norm': 6.550088405609131, 'learning_rate': 2.487131135807053e-06} [Rank 0] Trainer log: {'loss': 0.7245, 'grad_norm': 6.550088405609131, 'learning_rate': 2.487131135807053e-06}[Rank 1] Trainer log: {'loss': 0.7245, 'grad_norm': 6.550088405609131, 'learning_rate': 2.487131135807053e-06} [Rank 3] Trainer log: {'loss': 0.7245, 'grad_norm': 6.550088405609131, 'learning_rate': 2.487131135807053e-06} {'loss': 0.7245, 'grad_norm': 6.550088405609131, 'learning_rate': 2.487131135807053e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.9373, 'grad_norm': 2.047142744064331, 'learning_rate': 2.482632394499792e-06} [Rank 3] Trainer log: {'loss': 0.9373, 'grad_norm': 2.047142744064331, 'learning_rate': 2.482632394499792e-06} [Rank 0] Trainer log: {'loss': 0.9373, 'grad_norm': 2.047142744064331, 'learning_rate': 2.482632394499792e-06}[Rank 2] Trainer log: {'loss': 0.9373, 'grad_norm': 2.047142744064331, 'learning_rate': 2.482632394499792e-06} {'loss': 0.9373, 'grad_norm': 2.047142744064331, 'learning_rate': 2.482632394499792e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.8626, 'grad_norm': 6.850427627563477, 'learning_rate': 2.4781371488517824e-06}[Rank 1] Trainer log: {'loss': 0.8626, 'grad_norm': 6.850427627563477, 'learning_rate': 2.4781371488517824e-06}[Rank 3] Trainer log: {'loss': 0.8626, 'grad_norm': 6.850427627563477, 'learning_rate': 2.4781371488517824e-06} [Rank 2] Trainer log: {'loss': 0.8626, 'grad_norm': 6.850427627563477, 'learning_rate': 2.4781371488517824e-06} {'loss': 0.8626, 'grad_norm': 6.850427627563477, 'learning_rate': 2.4781371488517824e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 0.8963, 'grad_norm': 3.6823394298553467, 'learning_rate': 2.473645400953366e-06}[Rank 3] Trainer log: {'loss': 0.8963, 'grad_norm': 3.6823394298553467, 'learning_rate': 2.473645400953366e-06} [Rank 2] Trainer log: {'loss': 0.8963, 'grad_norm': 3.6823394298553467, 'learning_rate': 2.473645400953366e-06} [Rank 1] Trainer log: {'loss': 0.8963, 'grad_norm': 3.6823394298553467, 'learning_rate': 2.473645400953366e-06} {'loss': 0.8963, 'grad_norm': 3.6823394298553467, 'learning_rate': 2.473645400953366e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.92, 'grad_norm': 4.04418420791626, 'learning_rate': 2.4691571528932555e-06} [Rank 1] Trainer log: {'loss': 0.92, 'grad_norm': 4.04418420791626, 'learning_rate': 2.4691571528932555e-06}[Rank 0] Trainer log: {'loss': 0.92, 'grad_norm': 4.04418420791626, 'learning_rate': 2.4691571528932555e-06} [Rank 2] Trainer log: {'loss': 0.92, 'grad_norm': 4.04418420791626, 'learning_rate': 2.4691571528932555e-06} {'loss': 0.92, 'grad_norm': 4.04418420791626, 'learning_rate': 2.4691571528932555e-06, 'epoch': 0.78} [Rank 0] Trainer log: {'loss': 1.0233, 'grad_norm': 5.7810258865356445, 'learning_rate': 2.4646724067585302e-06}[Rank 3] Trainer log: {'loss': 1.0233, 'grad_norm': 5.7810258865356445, 'learning_rate': 2.4646724067585302e-06} [Rank 1] Trainer log: {'loss': 1.0233, 'grad_norm': 5.7810258865356445, 'learning_rate': 2.4646724067585302e-06} [Rank 2] Trainer log: {'loss': 1.0233, 'grad_norm': 5.7810258865356445, 'learning_rate': 2.4646724067585302e-06} {'loss': 1.0233, 'grad_norm': 5.7810258865356445, 'learning_rate': 2.4646724067585302e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.9508, 'grad_norm': 3.9651806354522705, 'learning_rate': 2.460191164634651e-06}[Rank 0] Trainer log: {'loss': 0.9508, 'grad_norm': 3.9651806354522705, 'learning_rate': 2.460191164634651e-06}[Rank 1] Trainer log: {'loss': 0.9508, 'grad_norm': 3.9651806354522705, 'learning_rate': 2.460191164634651e-06} [Rank 3] Trainer log: {'loss': 0.9508, 'grad_norm': 3.9651806354522705, 'learning_rate': 2.460191164634651e-06} {'loss': 0.9508, 'grad_norm': 3.9651806354522705, 'learning_rate': 2.460191164634651e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.8586, 'grad_norm': 3.799243688583374, 'learning_rate': 2.4557134286054486e-06}[Rank 3] Trainer log: {'loss': 0.8586, 'grad_norm': 3.799243688583374, 'learning_rate': 2.4557134286054486e-06}[Rank 0] Trainer log: {'loss': 0.8586, 'grad_norm': 3.799243688583374, 'learning_rate': 2.4557134286054486e-06} [Rank 2] Trainer log: {'loss': 0.8586, 'grad_norm': 3.799243688583374, 'learning_rate': 2.4557134286054486e-06} {'loss': 0.8586, 'grad_norm': 3.799243688583374, 'learning_rate': 2.4557134286054486e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.7694, 'grad_norm': 2.2742724418640137, 'learning_rate': 2.451239200753113e-06}[Rank 1] Trainer log: {'loss': 0.7694, 'grad_norm': 2.2742724418640137, 'learning_rate': 2.451239200753113e-06}[Rank 3] Trainer log: {'loss': 0.7694, 'grad_norm': 2.2742724418640137, 'learning_rate': 2.451239200753113e-06} [Rank 0] Trainer log: {'loss': 0.7694, 'grad_norm': 2.2742724418640137, 'learning_rate': 2.451239200753113e-06} {'loss': 0.7694, 'grad_norm': 2.2742724418640137, 'learning_rate': 2.451239200753113e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.8663, 'grad_norm': 10.911438941955566, 'learning_rate': 2.446768483158215e-06} [Rank 3] Trainer log: {'loss': 0.8663, 'grad_norm': 10.911438941955566, 'learning_rate': 2.446768483158215e-06} [Rank 0] Trainer log: {'loss': 0.8663, 'grad_norm': 10.911438941955566, 'learning_rate': 2.446768483158215e-06}[Rank 1] Trainer log: {'loss': 0.8663, 'grad_norm': 10.911438941955566, 'learning_rate': 2.446768483158215e-06} {'loss': 0.8663, 'grad_norm': 10.911438941955566, 'learning_rate': 2.446768483158215e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 1.0281, 'grad_norm': 6.320929050445557, 'learning_rate': 2.442301277899687e-06} [Rank 0] Trainer log: {'loss': 1.0281, 'grad_norm': 6.320929050445557, 'learning_rate': 2.442301277899687e-06} [Rank 2] Trainer log: {'loss': 1.0281, 'grad_norm': 6.320929050445557, 'learning_rate': 2.442301277899687e-06} [Rank 1] Trainer log: {'loss': 1.0281, 'grad_norm': 6.320929050445557, 'learning_rate': 2.442301277899687e-06} {'loss': 1.0281, 'grad_norm': 6.320929050445557, 'learning_rate': 2.442301277899687e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.7567, 'grad_norm': 8.90937328338623, 'learning_rate': 2.4378375870548253e-06} [Rank 0] Trainer log: {'loss': 0.7567, 'grad_norm': 8.90937328338623, 'learning_rate': 2.4378375870548253e-06}[Rank 3] Trainer log: {'loss': 0.7567, 'grad_norm': 8.90937328338623, 'learning_rate': 2.4378375870548253e-06} [Rank 1] Trainer log: {'loss': 0.7567, 'grad_norm': 8.90937328338623, 'learning_rate': 2.4378375870548253e-06} {'loss': 0.7567, 'grad_norm': 8.90937328338623, 'learning_rate': 2.4378375870548253e-06, 'epoch': 0.78} [Rank 3] Trainer log: {'loss': 0.8087, 'grad_norm': 4.827927589416504, 'learning_rate': 2.4333774126993003e-06}[Rank 2] Trainer log: {'loss': 0.8087, 'grad_norm': 4.827927589416504, 'learning_rate': 2.4333774126993003e-06} [Rank 1] Trainer log: {'loss': 0.8087, 'grad_norm': 4.827927589416504, 'learning_rate': 2.4333774126993003e-06} [Rank 0] Trainer log: {'loss': 0.8087, 'grad_norm': 4.827927589416504, 'learning_rate': 2.4333774126993003e-06} {'loss': 0.8087, 'grad_norm': 4.827927589416504, 'learning_rate': 2.4333774126993003e-06, 'epoch': 0.78} [Rank 1] Trainer log: {'loss': 0.8081, 'grad_norm': 4.51323127746582, 'learning_rate': 2.428920756907137e-06}[Rank 0] Trainer log: {'loss': 0.8081, 'grad_norm': 4.51323127746582, 'learning_rate': 2.428920756907137e-06} [Rank 3] Trainer log: {'loss': 0.8081, 'grad_norm': 4.51323127746582, 'learning_rate': 2.428920756907137e-06} [Rank 2] Trainer log: {'loss': 0.8081, 'grad_norm': 4.51323127746582, 'learning_rate': 2.428920756907137e-06} {'loss': 0.8081, 'grad_norm': 4.51323127746582, 'learning_rate': 2.428920756907137e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.5404, 'grad_norm': 1.6681126356124878, 'learning_rate': 2.4244676217507367e-06} [Rank 1] Trainer log: {'loss': 0.5404, 'grad_norm': 1.6681126356124878, 'learning_rate': 2.4244676217507367e-06}[Rank 3] Trainer log: {'loss': 0.5404, 'grad_norm': 1.6681126356124878, 'learning_rate': 2.4244676217507367e-06} [Rank 0] Trainer log: {'loss': 0.5404, 'grad_norm': 1.6681126356124878, 'learning_rate': 2.4244676217507367e-06} {'loss': 0.5404, 'grad_norm': 1.6681126356124878, 'learning_rate': 2.4244676217507367e-06, 'epoch': 0.78} [Rank 2] Trainer log: {'loss': 0.6288, 'grad_norm': 6.03494119644165, 'learning_rate': 2.4200180093008506e-06}[Rank 0] Trainer log: {'loss': 0.6288, 'grad_norm': 6.03494119644165, 'learning_rate': 2.4200180093008506e-06}[Rank 3] Trainer log: {'loss': 0.6288, 'grad_norm': 6.03494119644165, 'learning_rate': 2.4200180093008506e-06} [Rank 1] Trainer log: {'loss': 0.6288, 'grad_norm': 6.03494119644165, 'learning_rate': 2.4200180093008506e-06} {'loss': 0.6288, 'grad_norm': 6.03494119644165, 'learning_rate': 2.4200180093008506e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9278, 'grad_norm': 4.989682674407959, 'learning_rate': 2.4155719216266027e-06}[Rank 1] Trainer log: {'loss': 0.9278, 'grad_norm': 4.989682674407959, 'learning_rate': 2.4155719216266027e-06}[Rank 0] Trainer log: {'loss': 0.9278, 'grad_norm': 4.989682674407959, 'learning_rate': 2.4155719216266027e-06} [Rank 2] Trainer log: {'loss': 0.9278, 'grad_norm': 4.989682674407959, 'learning_rate': 2.4155719216266027e-06} {'loss': 0.9278, 'grad_norm': 4.989682674407959, 'learning_rate': 2.4155719216266027e-06, 'epoch': 0.79} [Rank 1] Trainer log: {'loss': 0.8421, 'grad_norm': 5.56791353225708, 'learning_rate': 2.4111293607954676e-06}[Rank 3] Trainer log: {'loss': 0.8421, 'grad_norm': 5.56791353225708, 'learning_rate': 2.4111293607954676e-06}[Rank 0] Trainer log: {'loss': 0.8421, 'grad_norm': 5.56791353225708, 'learning_rate': 2.4111293607954676e-06} [Rank 2] Trainer log: {'loss': 0.8421, 'grad_norm': 5.56791353225708, 'learning_rate': 2.4111293607954676e-06} {'loss': 0.8421, 'grad_norm': 5.56791353225708, 'learning_rate': 2.4111293607954676e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.6986, 'grad_norm': 3.7678258419036865, 'learning_rate': 2.4066903288732912e-06}[Rank 1] Trainer log: {'loss': 0.6986, 'grad_norm': 3.7678258419036865, 'learning_rate': 2.4066903288732912e-06}[Rank 0] Trainer log: {'loss': 0.6986, 'grad_norm': 3.7678258419036865, 'learning_rate': 2.4066903288732912e-06} [Rank 2] Trainer log: {'loss': 0.6986, 'grad_norm': 3.7678258419036865, 'learning_rate': 2.4066903288732912e-06} {'loss': 0.6986, 'grad_norm': 3.7678258419036865, 'learning_rate': 2.4066903288732912e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.7313, 'grad_norm': 7.914790153503418, 'learning_rate': 2.4022548279242696e-06}[Rank 1] Trainer log: {'loss': 0.7313, 'grad_norm': 7.914790153503418, 'learning_rate': 2.4022548279242696e-06}[Rank 0] Trainer log: {'loss': 0.7313, 'grad_norm': 7.914790153503418, 'learning_rate': 2.4022548279242696e-06} [Rank 2] Trainer log: {'loss': 0.7313, 'grad_norm': 7.914790153503418, 'learning_rate': 2.4022548279242696e-06} {'loss': 0.7313, 'grad_norm': 7.914790153503418, 'learning_rate': 2.4022548279242696e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 1.0314, 'grad_norm': 12.078778266906738, 'learning_rate': 2.3978228600109564e-06}[Rank 3] Trainer log: {'loss': 1.0314, 'grad_norm': 12.078778266906738, 'learning_rate': 2.3978228600109564e-06}[Rank 1] Trainer log: {'loss': 1.0314, 'grad_norm': 12.078778266906738, 'learning_rate': 2.3978228600109564e-06} [Rank 2] Trainer log: {'loss': 1.0314, 'grad_norm': 12.078778266906738, 'learning_rate': 2.3978228600109564e-06} {'loss': 1.0314, 'grad_norm': 12.078778266906738, 'learning_rate': 2.3978228600109564e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.8346, 'grad_norm': 3.110431671142578, 'learning_rate': 2.3933944271942734e-06}[Rank 3] Trainer log: {'loss': 0.8346, 'grad_norm': 3.110431671142578, 'learning_rate': 2.3933944271942734e-06} [Rank 1] Trainer log: {'loss': 0.8346, 'grad_norm': 3.110431671142578, 'learning_rate': 2.3933944271942734e-06} [Rank 2] Trainer log: {'loss': 0.8346, 'grad_norm': 3.110431671142578, 'learning_rate': 2.3933944271942734e-06} {'loss': 0.8346, 'grad_norm': 3.110431671142578, 'learning_rate': 2.3933944271942734e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.7319, 'grad_norm': 7.816564559936523, 'learning_rate': 2.3889695315334826e-06}[Rank 3] Trainer log: {'loss': 0.7319, 'grad_norm': 7.816564559936523, 'learning_rate': 2.3889695315334826e-06}[Rank 0] Trainer log: {'loss': 0.7319, 'grad_norm': 7.816564559936523, 'learning_rate': 2.3889695315334826e-06} [Rank 1] Trainer log: {'loss': 0.7319, 'grad_norm': 7.816564559936523, 'learning_rate': 2.3889695315334826e-06} {'loss': 0.7319, 'grad_norm': 7.816564559936523, 'learning_rate': 2.3889695315334826e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.6725, 'grad_norm': 4.481846809387207, 'learning_rate': 2.3845481750862175e-06}[Rank 1] Trainer log: {'loss': 0.6725, 'grad_norm': 4.481846809387207, 'learning_rate': 2.3845481750862175e-06} [Rank 0] Trainer log: {'loss': 0.6725, 'grad_norm': 4.481846809387207, 'learning_rate': 2.3845481750862175e-06} [Rank 2] Trainer log: {'loss': 0.6725, 'grad_norm': 4.481846809387207, 'learning_rate': 2.3845481750862175e-06} {'loss': 0.6725, 'grad_norm': 4.481846809387207, 'learning_rate': 2.3845481750862175e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.8085, 'grad_norm': 4.863733291625977, 'learning_rate': 2.3801303599084513e-06}[Rank 0] Trainer log: {'loss': 0.8085, 'grad_norm': 4.863733291625977, 'learning_rate': 2.3801303599084513e-06}[Rank 2] Trainer log: {'loss': 0.8085, 'grad_norm': 4.863733291625977, 'learning_rate': 2.3801303599084513e-06} [Rank 1] Trainer log: {'loss': 0.8085, 'grad_norm': 4.863733291625977, 'learning_rate': 2.3801303599084513e-06} {'loss': 0.8085, 'grad_norm': 4.863733291625977, 'learning_rate': 2.3801303599084513e-06, 'epoch': 0.79} [Rank 1] Trainer log: {'loss': 0.7262, 'grad_norm': 7.149824142456055, 'learning_rate': 2.3757160880545225e-06} [Rank 0] Trainer log: {'loss': 0.7262, 'grad_norm': 7.149824142456055, 'learning_rate': 2.3757160880545225e-06}[Rank 2] Trainer log: {'loss': 0.7262, 'grad_norm': 7.149824142456055, 'learning_rate': 2.3757160880545225e-06} [Rank 3] Trainer log: {'loss': 0.7262, 'grad_norm': 7.149824142456055, 'learning_rate': 2.3757160880545225e-06} {'loss': 0.7262, 'grad_norm': 7.149824142456055, 'learning_rate': 2.3757160880545225e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 1.0517, 'grad_norm': 2.1120941638946533, 'learning_rate': 2.3713053615771144e-06}[Rank 3] Trainer log: {'loss': 1.0517, 'grad_norm': 2.1120941638946533, 'learning_rate': 2.3713053615771144e-06}[Rank 1] Trainer log: {'loss': 1.0517, 'grad_norm': 2.1120941638946533, 'learning_rate': 2.3713053615771144e-06} [Rank 2] Trainer log: {'loss': 1.0517, 'grad_norm': 2.1120941638946533, 'learning_rate': 2.3713053615771144e-06} {'loss': 1.0517, 'grad_norm': 2.1120941638946533, 'learning_rate': 2.3713053615771144e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.6905, 'grad_norm': 2.3337161540985107, 'learning_rate': 2.3668981825272595e-06}[Rank 3] Trainer log: {'loss': 0.6905, 'grad_norm': 2.3337161540985107, 'learning_rate': 2.3668981825272595e-06} [Rank 1] Trainer log: {'loss': 0.6905, 'grad_norm': 2.3337161540985107, 'learning_rate': 2.3668981825272595e-06} [Rank 2] Trainer log: {'loss': 0.6905, 'grad_norm': 2.3337161540985107, 'learning_rate': 2.3668981825272595e-06} {'loss': 0.6905, 'grad_norm': 2.3337161540985107, 'learning_rate': 2.3668981825272595e-06, 'epoch': 0.79} [Rank 1] Trainer log: {'loss': 0.9769, 'grad_norm': 1.8375239372253418, 'learning_rate': 2.362494552954352e-06}[Rank 3] Trainer log: {'loss': 0.9769, 'grad_norm': 1.8375239372253418, 'learning_rate': 2.362494552954352e-06} [Rank 2] Trainer log: {'loss': 0.9769, 'grad_norm': 1.8375239372253418, 'learning_rate': 2.362494552954352e-06} [Rank 0] Trainer log: {'loss': 0.9769, 'grad_norm': 1.8375239372253418, 'learning_rate': 2.362494552954352e-06} {'loss': 0.9769, 'grad_norm': 1.8375239372253418, 'learning_rate': 2.362494552954352e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.8901, 'grad_norm': 2.8714759349823, 'learning_rate': 2.358094474906124e-06}[Rank 3] Trainer log: {'loss': 0.8901, 'grad_norm': 2.8714759349823, 'learning_rate': 2.358094474906124e-06}[Rank 1] Trainer log: {'loss': 0.8901, 'grad_norm': 2.8714759349823, 'learning_rate': 2.358094474906124e-06} [Rank 2] Trainer log: {'loss': 0.8901, 'grad_norm': 2.8714759349823, 'learning_rate': 2.358094474906124e-06} {'loss': 0.8901, 'grad_norm': 2.8714759349823, 'learning_rate': 2.358094474906124e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.7717, 'grad_norm': 7.519294738769531, 'learning_rate': 2.3536979504286596e-06}[Rank 0] Trainer log: {'loss': 0.7717, 'grad_norm': 7.519294738769531, 'learning_rate': 2.3536979504286596e-06}[Rank 2] Trainer log: {'loss': 0.7717, 'grad_norm': 7.519294738769531, 'learning_rate': 2.3536979504286596e-06} [Rank 1] Trainer log: {'loss': 0.7717, 'grad_norm': 7.519294738769531, 'learning_rate': 2.3536979504286596e-06} {'loss': 0.7717, 'grad_norm': 7.519294738769531, 'learning_rate': 2.3536979504286596e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.8722, 'grad_norm': 5.960099220275879, 'learning_rate': 2.3493049815663926e-06}[Rank 2] Trainer log: {'loss': 0.8722, 'grad_norm': 5.960099220275879, 'learning_rate': 2.3493049815663926e-06} [Rank 0] Trainer log: {'loss': 0.8722, 'grad_norm': 5.960099220275879, 'learning_rate': 2.3493049815663926e-06} [Rank 1] Trainer log: {'loss': 0.8722, 'grad_norm': 5.960099220275879, 'learning_rate': 2.3493049815663926e-06} {'loss': 0.8722, 'grad_norm': 5.960099220275879, 'learning_rate': 2.3493049815663926e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.6339, 'grad_norm': 10.307536125183105, 'learning_rate': 2.3449155703621063e-06}[Rank 0] Trainer log: {'loss': 0.6339, 'grad_norm': 10.307536125183105, 'learning_rate': 2.3449155703621063e-06} [Rank 2] Trainer log: {'loss': 0.6339, 'grad_norm': 10.307536125183105, 'learning_rate': 2.3449155703621063e-06} [Rank 1] Trainer log: {'loss': 0.6339, 'grad_norm': 10.307536125183105, 'learning_rate': 2.3449155703621063e-06} {'loss': 0.6339, 'grad_norm': 10.307536125183105, 'learning_rate': 2.3449155703621063e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9099, 'grad_norm': 7.540370464324951, 'learning_rate': 2.3405297188569186e-06}[Rank 0] Trainer log: {'loss': 0.9099, 'grad_norm': 7.540370464324951, 'learning_rate': 2.3405297188569186e-06}[Rank 1] Trainer log: {'loss': 0.9099, 'grad_norm': 7.540370464324951, 'learning_rate': 2.3405297188569186e-06} [Rank 2] Trainer log: {'loss': 0.9099, 'grad_norm': 7.540370464324951, 'learning_rate': 2.3405297188569186e-06} {'loss': 0.9099, 'grad_norm': 7.540370464324951, 'learning_rate': 2.3405297188569186e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.6518, 'grad_norm': 6.7511820793151855, 'learning_rate': 2.3361474290903064e-06}[Rank 3] Trainer log: {'loss': 0.6518, 'grad_norm': 6.7511820793151855, 'learning_rate': 2.3361474290903064e-06} [Rank 2] Trainer log: {'loss': 0.6518, 'grad_norm': 6.7511820793151855, 'learning_rate': 2.3361474290903064e-06} [Rank 1] Trainer log: {'loss': 0.6518, 'grad_norm': 6.7511820793151855, 'learning_rate': 2.3361474290903064e-06} {'loss': 0.6518, 'grad_norm': 6.7511820793151855, 'learning_rate': 2.3361474290903064e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.8652, 'grad_norm': 6.455410003662109, 'learning_rate': 2.331768703100079e-06}[Rank 0] Trainer log: {'loss': 0.8652, 'grad_norm': 6.455410003662109, 'learning_rate': 2.331768703100079e-06} [Rank 1] Trainer log: {'loss': 0.8652, 'grad_norm': 6.455410003662109, 'learning_rate': 2.331768703100079e-06} [Rank 2] Trainer log: {'loss': 0.8652, 'grad_norm': 6.455410003662109, 'learning_rate': 2.331768703100079e-06} {'loss': 0.8652, 'grad_norm': 6.455410003662109, 'learning_rate': 2.331768703100079e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.5021, 'grad_norm': 10.380352973937988, 'learning_rate': 2.3273935429223893e-06} [Rank 1] Trainer log: {'loss': 0.5021, 'grad_norm': 10.380352973937988, 'learning_rate': 2.3273935429223893e-06} [Rank 3] Trainer log: {'loss': 0.5021, 'grad_norm': 10.380352973937988, 'learning_rate': 2.3273935429223893e-06} [Rank 0] Trainer log: {'loss': 0.5021, 'grad_norm': 10.380352973937988, 'learning_rate': 2.3273935429223893e-06} {'loss': 0.5021, 'grad_norm': 10.380352973937988, 'learning_rate': 2.3273935429223893e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.7545, 'grad_norm': 2.6335155963897705, 'learning_rate': 2.323021950591743e-06}[Rank 3] Trainer log: {'loss': 0.7545, 'grad_norm': 2.6335155963897705, 'learning_rate': 2.323021950591743e-06}[Rank 1] Trainer log: {'loss': 0.7545, 'grad_norm': 2.6335155963897705, 'learning_rate': 2.323021950591743e-06} [Rank 2] Trainer log: {'loss': 0.7545, 'grad_norm': 2.6335155963897705, 'learning_rate': 2.323021950591743e-06} {'loss': 0.7545, 'grad_norm': 2.6335155963897705, 'learning_rate': 2.323021950591743e-06, 'epoch': 0.79} [Rank 1] Trainer log: {'loss': 0.8674, 'grad_norm': 4.903423309326172, 'learning_rate': 2.3186539281409735e-06} [Rank 0] Trainer log: {'loss': 0.8674, 'grad_norm': 4.903423309326172, 'learning_rate': 2.3186539281409735e-06}[Rank 3] Trainer log: {'loss': 0.8674, 'grad_norm': 4.903423309326172, 'learning_rate': 2.3186539281409735e-06} [Rank 2] Trainer log: {'loss': 0.8674, 'grad_norm': 4.903423309326172, 'learning_rate': 2.3186539281409735e-06} {'loss': 0.8674, 'grad_norm': 4.903423309326172, 'learning_rate': 2.3186539281409735e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.834, 'grad_norm': 9.543343544006348, 'learning_rate': 2.314289477601258e-06}[Rank 0] Trainer log: {'loss': 0.834, 'grad_norm': 9.543343544006348, 'learning_rate': 2.314289477601258e-06} [Rank 1] Trainer log: {'loss': 0.834, 'grad_norm': 9.543343544006348, 'learning_rate': 2.314289477601258e-06} [Rank 2] Trainer log: {'loss': 0.834, 'grad_norm': 9.543343544006348, 'learning_rate': 2.314289477601258e-06} {'loss': 0.834, 'grad_norm': 9.543343544006348, 'learning_rate': 2.314289477601258e-06, 'epoch': 0.79} [Rank 1] Trainer log: {'loss': 0.7516, 'grad_norm': 7.98604154586792, 'learning_rate': 2.309928601002117e-06}[Rank 3] Trainer log: {'loss': 0.7516, 'grad_norm': 7.98604154586792, 'learning_rate': 2.309928601002117e-06}[Rank 0] Trainer log: {'loss': 0.7516, 'grad_norm': 7.98604154586792, 'learning_rate': 2.309928601002117e-06} [Rank 2] Trainer log: {'loss': 0.7516, 'grad_norm': 7.98604154586792, 'learning_rate': 2.309928601002117e-06} {'loss': 0.7516, 'grad_norm': 7.98604154586792, 'learning_rate': 2.309928601002117e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.7146, 'grad_norm': 1.685324788093567, 'learning_rate': 2.305571300371411e-06} [Rank 0] Trainer log: {'loss': 0.7146, 'grad_norm': 1.685324788093567, 'learning_rate': 2.305571300371411e-06}[Rank 2] Trainer log: {'loss': 0.7146, 'grad_norm': 1.685324788093567, 'learning_rate': 2.305571300371411e-06} [Rank 1] Trainer log: {'loss': 0.7146, 'grad_norm': 1.685324788093567, 'learning_rate': 2.305571300371411e-06} {'loss': 0.7146, 'grad_norm': 1.685324788093567, 'learning_rate': 2.305571300371411e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9264, 'grad_norm': 2.5673484802246094, 'learning_rate': 2.301217577735325e-06}[Rank 1] Trainer log: {'loss': 0.9264, 'grad_norm': 2.5673484802246094, 'learning_rate': 2.301217577735325e-06} [Rank 2] Trainer log: {'loss': 0.9264, 'grad_norm': 2.5673484802246094, 'learning_rate': 2.301217577735325e-06} [Rank 0] Trainer log: {'loss': 0.9264, 'grad_norm': 2.5673484802246094, 'learning_rate': 2.301217577735325e-06} {'loss': 0.9264, 'grad_norm': 2.5673484802246094, 'learning_rate': 2.301217577735325e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.8127, 'grad_norm': 4.4436516761779785, 'learning_rate': 2.296867435118395e-06}[Rank 2] Trainer log: {'loss': 0.8127, 'grad_norm': 4.4436516761779785, 'learning_rate': 2.296867435118395e-06}[Rank 1] Trainer log: {'loss': 0.8127, 'grad_norm': 4.4436516761779785, 'learning_rate': 2.296867435118395e-06} [Rank 3] Trainer log: {'loss': 0.8127, 'grad_norm': 4.4436516761779785, 'learning_rate': 2.296867435118395e-06} {'loss': 0.8127, 'grad_norm': 4.4436516761779785, 'learning_rate': 2.296867435118395e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.9094, 'grad_norm': 5.9078803062438965, 'learning_rate': 2.2925208745434824e-06}[Rank 2] Trainer log: {'loss': 0.9094, 'grad_norm': 5.9078803062438965, 'learning_rate': 2.2925208745434824e-06}[Rank 1] Trainer log: {'loss': 0.9094, 'grad_norm': 5.9078803062438965, 'learning_rate': 2.2925208745434824e-06} [Rank 3] Trainer log: {'loss': 0.9094, 'grad_norm': 5.9078803062438965, 'learning_rate': 2.2925208745434824e-06} {'loss': 0.9094, 'grad_norm': 5.9078803062438965, 'learning_rate': 2.2925208745434824e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.6737, 'grad_norm': 4.193510055541992, 'learning_rate': 2.2881778980317837e-06} [Rank 2] Trainer log: {'loss': 0.6737, 'grad_norm': 4.193510055541992, 'learning_rate': 2.2881778980317837e-06} [Rank 0] Trainer log: {'loss': 0.6737, 'grad_norm': 4.193510055541992, 'learning_rate': 2.2881778980317837e-06}[Rank 1] Trainer log: {'loss': 0.6737, 'grad_norm': 4.193510055541992, 'learning_rate': 2.2881778980317837e-06} {'loss': 0.6737, 'grad_norm': 4.193510055541992, 'learning_rate': 2.2881778980317837e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9096, 'grad_norm': 6.102855682373047, 'learning_rate': 2.283838507602837e-06}[Rank 1] Trainer log: {'loss': 0.9096, 'grad_norm': 6.102855682373047, 'learning_rate': 2.283838507602837e-06}[Rank 0] Trainer log: {'loss': 0.9096, 'grad_norm': 6.102855682373047, 'learning_rate': 2.283838507602837e-06} [Rank 2] Trainer log: {'loss': 0.9096, 'grad_norm': 6.102855682373047, 'learning_rate': 2.283838507602837e-06} {'loss': 0.9096, 'grad_norm': 6.102855682373047, 'learning_rate': 2.283838507602837e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.938, 'grad_norm': 2.0788352489471436, 'learning_rate': 2.279502705274502e-06}[Rank 1] Trainer log: {'loss': 0.938, 'grad_norm': 2.0788352489471436, 'learning_rate': 2.279502705274502e-06}[Rank 3] Trainer log: {'loss': 0.938, 'grad_norm': 2.0788352489471436, 'learning_rate': 2.279502705274502e-06} [Rank 2] Trainer log: {'loss': 0.938, 'grad_norm': 2.0788352489471436, 'learning_rate': 2.279502705274502e-06} {'loss': 0.938, 'grad_norm': 2.0788352489471436, 'learning_rate': 2.279502705274502e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.878, 'grad_norm': 9.146293640136719, 'learning_rate': 2.2751704930629746e-06}[Rank 1] Trainer log: {'loss': 0.878, 'grad_norm': 9.146293640136719, 'learning_rate': 2.2751704930629746e-06} [Rank 0] Trainer log: {'loss': 0.878, 'grad_norm': 9.146293640136719, 'learning_rate': 2.2751704930629746e-06} [Rank 2] Trainer log: {'loss': 0.878, 'grad_norm': 9.146293640136719, 'learning_rate': 2.2751704930629746e-06} {'loss': 0.878, 'grad_norm': 9.146293640136719, 'learning_rate': 2.2751704930629746e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.8123, 'grad_norm': 1.9247523546218872, 'learning_rate': 2.270841872982784e-06}[Rank 0] Trainer log: {'loss': 0.8123, 'grad_norm': 1.9247523546218872, 'learning_rate': 2.270841872982784e-06} [Rank 1] Trainer log: {'loss': 0.8123, 'grad_norm': 1.9247523546218872, 'learning_rate': 2.270841872982784e-06} [Rank 2] Trainer log: {'loss': 0.8123, 'grad_norm': 1.9247523546218872, 'learning_rate': 2.270841872982784e-06} {'loss': 0.8123, 'grad_norm': 1.9247523546218872, 'learning_rate': 2.270841872982784e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.6813, 'grad_norm': 10.166728019714355, 'learning_rate': 2.266516847046788e-06}[Rank 2] Trainer log: {'loss': 0.6813, 'grad_norm': 10.166728019714355, 'learning_rate': 2.266516847046788e-06}[Rank 1] Trainer log: {'loss': 0.6813, 'grad_norm': 10.166728019714355, 'learning_rate': 2.266516847046788e-06} [Rank 3] Trainer log: {'loss': 0.6813, 'grad_norm': 10.166728019714355, 'learning_rate': 2.266516847046788e-06} {'loss': 0.6813, 'grad_norm': 10.166728019714355, 'learning_rate': 2.266516847046788e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9121, 'grad_norm': 7.308281421661377, 'learning_rate': 2.26219541726617e-06} [Rank 0] Trainer log: {'loss': 0.9121, 'grad_norm': 7.308281421661377, 'learning_rate': 2.26219541726617e-06}[Rank 1] Trainer log: {'loss': 0.9121, 'grad_norm': 7.308281421661377, 'learning_rate': 2.26219541726617e-06} [Rank 2] Trainer log: {'loss': 0.9121, 'grad_norm': 7.308281421661377, 'learning_rate': 2.26219541726617e-06} {'loss': 0.9121, 'grad_norm': 7.308281421661377, 'learning_rate': 2.26219541726617e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.6996, 'grad_norm': 4.811783790588379, 'learning_rate': 2.2578775856504386e-06}[Rank 2] Trainer log: {'loss': 0.6996, 'grad_norm': 4.811783790588379, 'learning_rate': 2.2578775856504386e-06} [Rank 3] Trainer log: {'loss': 0.6996, 'grad_norm': 4.811783790588379, 'learning_rate': 2.2578775856504386e-06} [Rank 1] Trainer log: {'loss': 0.6996, 'grad_norm': 4.811783790588379, 'learning_rate': 2.2578775856504386e-06} {'loss': 0.6996, 'grad_norm': 4.811783790588379, 'learning_rate': 2.2578775856504386e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.8213, 'grad_norm': 1.8516058921813965, 'learning_rate': 2.25356335420744e-06}[Rank 3] Trainer log: {'loss': 0.8213, 'grad_norm': 1.8516058921813965, 'learning_rate': 2.25356335420744e-06} [Rank 1] Trainer log: {'loss': 0.8213, 'grad_norm': 1.8516058921813965, 'learning_rate': 2.25356335420744e-06} [Rank 2] Trainer log: {'loss': 0.8213, 'grad_norm': 1.8516058921813965, 'learning_rate': 2.25356335420744e-06} {'loss': 0.8213, 'grad_norm': 1.8516058921813965, 'learning_rate': 2.25356335420744e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.8714, 'grad_norm': 2.152378797531128, 'learning_rate': 2.249252724943336e-06}[Rank 3] Trainer log: {'loss': 0.8714, 'grad_norm': 2.152378797531128, 'learning_rate': 2.249252724943336e-06} [Rank 1] Trainer log: {'loss': 0.8714, 'grad_norm': 2.152378797531128, 'learning_rate': 2.249252724943336e-06} [Rank 0] Trainer log: {'loss': 0.8714, 'grad_norm': 2.152378797531128, 'learning_rate': 2.249252724943336e-06} {'loss': 0.8714, 'grad_norm': 2.152378797531128, 'learning_rate': 2.249252724943336e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.818, 'grad_norm': 8.979559898376465, 'learning_rate': 2.244945699862615e-06}[Rank 1] Trainer log: {'loss': 0.818, 'grad_norm': 8.979559898376465, 'learning_rate': 2.244945699862615e-06} [Rank 2] Trainer log: {'loss': 0.818, 'grad_norm': 8.979559898376465, 'learning_rate': 2.244945699862615e-06} [Rank 0] Trainer log: {'loss': 0.818, 'grad_norm': 8.979559898376465, 'learning_rate': 2.244945699862615e-06} {'loss': 0.818, 'grad_norm': 8.979559898376465, 'learning_rate': 2.244945699862615e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.9316, 'grad_norm': 4.433457374572754, 'learning_rate': 2.240642280968097e-06}[Rank 3] Trainer log: {'loss': 0.9316, 'grad_norm': 4.433457374572754, 'learning_rate': 2.240642280968097e-06}[Rank 0] Trainer log: {'loss': 0.9316, 'grad_norm': 4.433457374572754, 'learning_rate': 2.240642280968097e-06} [Rank 1] Trainer log: {'loss': 0.9316, 'grad_norm': 4.433457374572754, 'learning_rate': 2.240642280968097e-06} {'loss': 0.9316, 'grad_norm': 4.433457374572754, 'learning_rate': 2.240642280968097e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.8713, 'grad_norm': 8.339127540588379, 'learning_rate': 2.236342470260914e-06}[Rank 3] Trainer log: {'loss': 0.8713, 'grad_norm': 8.339127540588379, 'learning_rate': 2.236342470260914e-06}[Rank 0] Trainer log: {'loss': 0.8713, 'grad_norm': 8.339127540588379, 'learning_rate': 2.236342470260914e-06} [Rank 1] Trainer log: {'loss': 0.8713, 'grad_norm': 8.339127540588379, 'learning_rate': 2.236342470260914e-06} {'loss': 0.8713, 'grad_norm': 8.339127540588379, 'learning_rate': 2.236342470260914e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.9152, 'grad_norm': 3.296705961227417, 'learning_rate': 2.2320462697405274e-06}[Rank 3] Trainer log: {'loss': 0.9152, 'grad_norm': 3.296705961227417, 'learning_rate': 2.2320462697405274e-06}[Rank 1] Trainer log: {'loss': 0.9152, 'grad_norm': 3.296705961227417, 'learning_rate': 2.2320462697405274e-06} [Rank 0] Trainer log: {'loss': 0.9152, 'grad_norm': 3.296705961227417, 'learning_rate': 2.2320462697405274e-06} {'loss': 0.9152, 'grad_norm': 3.296705961227417, 'learning_rate': 2.2320462697405274e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.9067, 'grad_norm': 5.838363170623779, 'learning_rate': 2.2277536814047218e-06} [Rank 2] Trainer log: {'loss': 0.9067, 'grad_norm': 5.838363170623779, 'learning_rate': 2.2277536814047218e-06} [Rank 0] Trainer log: {'loss': 0.9067, 'grad_norm': 5.838363170623779, 'learning_rate': 2.2277536814047218e-06}[Rank 1] Trainer log: {'loss': 0.9067, 'grad_norm': 5.838363170623779, 'learning_rate': 2.2277536814047218e-06} {'loss': 0.9067, 'grad_norm': 5.838363170623779, 'learning_rate': 2.2277536814047218e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.7529, 'grad_norm': 4.262943744659424, 'learning_rate': 2.223464707249595e-06}[Rank 1] Trainer log: {'loss': 0.7529, 'grad_norm': 4.262943744659424, 'learning_rate': 2.223464707249595e-06}[Rank 0] Trainer log: {'loss': 0.7529, 'grad_norm': 4.262943744659424, 'learning_rate': 2.223464707249595e-06} [Rank 2] Trainer log: {'loss': 0.7529, 'grad_norm': 4.262943744659424, 'learning_rate': 2.223464707249595e-06} {'loss': 0.7529, 'grad_norm': 4.262943744659424, 'learning_rate': 2.223464707249595e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.8417, 'grad_norm': 7.999035835266113, 'learning_rate': 2.2191793492695655e-06}[Rank 0] Trainer log: {'loss': 0.8417, 'grad_norm': 7.999035835266113, 'learning_rate': 2.2191793492695655e-06} [Rank 1] Trainer log: {'loss': 0.8417, 'grad_norm': 7.999035835266113, 'learning_rate': 2.2191793492695655e-06} [Rank 2] Trainer log: {'loss': 0.8417, 'grad_norm': 7.999035835266113, 'learning_rate': 2.2191793492695655e-06} {'loss': 0.8417, 'grad_norm': 7.999035835266113, 'learning_rate': 2.2191793492695655e-06, 'epoch': 0.79} [Rank 2] Trainer log: {'loss': 0.8448, 'grad_norm': 9.12429141998291, 'learning_rate': 2.214897609457377e-06}[Rank 0] Trainer log: {'loss': 0.8448, 'grad_norm': 9.12429141998291, 'learning_rate': 2.214897609457377e-06} [Rank 1] Trainer log: {'loss': 0.8448, 'grad_norm': 9.12429141998291, 'learning_rate': 2.214897609457377e-06} [Rank 3] Trainer log: {'loss': 0.8448, 'grad_norm': 9.12429141998291, 'learning_rate': 2.214897609457377e-06} {'loss': 0.8448, 'grad_norm': 9.12429141998291, 'learning_rate': 2.214897609457377e-06, 'epoch': 0.79} [Rank 0] Trainer log: {'loss': 0.5913, 'grad_norm': 3.977261781692505, 'learning_rate': 2.2106194898040845e-06}[Rank 3] Trainer log: {'loss': 0.5913, 'grad_norm': 3.977261781692505, 'learning_rate': 2.2106194898040845e-06} [Rank 1] Trainer log: {'loss': 0.5913, 'grad_norm': 3.977261781692505, 'learning_rate': 2.2106194898040845e-06} [Rank 2] Trainer log: {'loss': 0.5913, 'grad_norm': 3.977261781692505, 'learning_rate': 2.2106194898040845e-06} {'loss': 0.5913, 'grad_norm': 3.977261781692505, 'learning_rate': 2.2106194898040845e-06, 'epoch': 0.79} [Rank 3] Trainer log: {'loss': 0.696, 'grad_norm': 3.51308274269104, 'learning_rate': 2.2063449922990565e-06}[Rank 2] Trainer log: {'loss': 0.696, 'grad_norm': 3.51308274269104, 'learning_rate': 2.2063449922990565e-06} [Rank 0] Trainer log: {'loss': 0.696, 'grad_norm': 3.51308274269104, 'learning_rate': 2.2063449922990565e-06}[Rank 1] Trainer log: {'loss': 0.696, 'grad_norm': 3.51308274269104, 'learning_rate': 2.2063449922990565e-06} {'loss': 0.696, 'grad_norm': 3.51308274269104, 'learning_rate': 2.2063449922990565e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.9664, 'grad_norm': 2.1042442321777344, 'learning_rate': 2.2020741189299875e-06} [Rank 3] Trainer log: {'loss': 0.9664, 'grad_norm': 2.1042442321777344, 'learning_rate': 2.2020741189299875e-06} [Rank 0] Trainer log: {'loss': 0.9664, 'grad_norm': 2.1042442321777344, 'learning_rate': 2.2020741189299875e-06}[Rank 2] Trainer log: {'loss': 0.9664, 'grad_norm': 2.1042442321777344, 'learning_rate': 2.2020741189299875e-06} {'loss': 0.9664, 'grad_norm': 2.1042442321777344, 'learning_rate': 2.2020741189299875e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.6193, 'grad_norm': 4.078712463378906, 'learning_rate': 2.197806871682877e-06}[Rank 3] Trainer log: {'loss': 0.6193, 'grad_norm': 4.078712463378906, 'learning_rate': 2.197806871682877e-06} [Rank 0] Trainer log: {'loss': 0.6193, 'grad_norm': 4.078712463378906, 'learning_rate': 2.197806871682877e-06} [Rank 2] Trainer log: {'loss': 0.6193, 'grad_norm': 4.078712463378906, 'learning_rate': 2.197806871682877e-06} {'loss': 0.6193, 'grad_norm': 4.078712463378906, 'learning_rate': 2.197806871682877e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.7818, 'grad_norm': 3.7753565311431885, 'learning_rate': 2.193543252542043e-06}[Rank 0] Trainer log: {'loss': 0.7818, 'grad_norm': 3.7753565311431885, 'learning_rate': 2.193543252542043e-06}[Rank 1] Trainer log: {'loss': 0.7818, 'grad_norm': 3.7753565311431885, 'learning_rate': 2.193543252542043e-06} [Rank 2] Trainer log: {'loss': 0.7818, 'grad_norm': 3.7753565311431885, 'learning_rate': 2.193543252542043e-06} {'loss': 0.7818, 'grad_norm': 3.7753565311431885, 'learning_rate': 2.193543252542043e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.665, 'grad_norm': 2.934488296508789, 'learning_rate': 2.1892832634901194e-06}[Rank 1] Trainer log: {'loss': 0.665, 'grad_norm': 2.934488296508789, 'learning_rate': 2.1892832634901194e-06}[Rank 3] Trainer log: {'loss': 0.665, 'grad_norm': 2.934488296508789, 'learning_rate': 2.1892832634901194e-06} [Rank 0] Trainer log: {'loss': 0.665, 'grad_norm': 2.934488296508789, 'learning_rate': 2.1892832634901194e-06} {'loss': 0.665, 'grad_norm': 2.934488296508789, 'learning_rate': 2.1892832634901194e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.6666, 'grad_norm': 2.8472530841827393, 'learning_rate': 2.1850269065080464e-06}[Rank 1] Trainer log: {'loss': 0.6666, 'grad_norm': 2.8472530841827393, 'learning_rate': 2.1850269065080464e-06} [Rank 3] Trainer log: {'loss': 0.6666, 'grad_norm': 2.8472530841827393, 'learning_rate': 2.1850269065080464e-06} [Rank 2] Trainer log: {'loss': 0.6666, 'grad_norm': 2.8472530841827393, 'learning_rate': 2.1850269065080464e-06} {'loss': 0.6666, 'grad_norm': 2.8472530841827393, 'learning_rate': 2.1850269065080464e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.8245, 'grad_norm': 2.478281259536743, 'learning_rate': 2.180774183575074e-06}[Rank 2] Trainer log: {'loss': 0.8245, 'grad_norm': 2.478281259536743, 'learning_rate': 2.180774183575074e-06} [Rank 3] Trainer log: {'loss': 0.8245, 'grad_norm': 2.478281259536743, 'learning_rate': 2.180774183575074e-06} [Rank 0] Trainer log: {'loss': 0.8245, 'grad_norm': 2.478281259536743, 'learning_rate': 2.180774183575074e-06} {'loss': 0.8245, 'grad_norm': 2.478281259536743, 'learning_rate': 2.180774183575074e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 1.0914, 'grad_norm': 2.076280117034912, 'learning_rate': 2.176525096668769e-06} [Rank 1] Trainer log: {'loss': 1.0914, 'grad_norm': 2.076280117034912, 'learning_rate': 2.176525096668769e-06}[Rank 2] Trainer log: {'loss': 1.0914, 'grad_norm': 2.076280117034912, 'learning_rate': 2.176525096668769e-06} [Rank 0] Trainer log: {'loss': 1.0914, 'grad_norm': 2.076280117034912, 'learning_rate': 2.176525096668769e-06} {'loss': 1.0914, 'grad_norm': 2.076280117034912, 'learning_rate': 2.176525096668769e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.7743, 'grad_norm': 8.257143020629883, 'learning_rate': 2.1722796477650045e-06}[Rank 3] Trainer log: {'loss': 0.7743, 'grad_norm': 8.257143020629883, 'learning_rate': 2.1722796477650045e-06} [Rank 2] Trainer log: {'loss': 0.7743, 'grad_norm': 8.257143020629883, 'learning_rate': 2.1722796477650045e-06} [Rank 0] Trainer log: {'loss': 0.7743, 'grad_norm': 8.257143020629883, 'learning_rate': 2.1722796477650045e-06} {'loss': 0.7743, 'grad_norm': 8.257143020629883, 'learning_rate': 2.1722796477650045e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.9278, 'grad_norm': 3.2672717571258545, 'learning_rate': 2.1680378388379554e-06} [Rank 2] Trainer log: {'loss': 0.9278, 'grad_norm': 3.2672717571258545, 'learning_rate': 2.1680378388379554e-06} [Rank 1] Trainer log: {'loss': 0.9278, 'grad_norm': 3.2672717571258545, 'learning_rate': 2.1680378388379554e-06} [Rank 0] Trainer log: {'loss': 0.9278, 'grad_norm': 3.2672717571258545, 'learning_rate': 2.1680378388379554e-06} {'loss': 0.9278, 'grad_norm': 3.2672717571258545, 'learning_rate': 2.1680378388379554e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.8489, 'grad_norm': 4.128507137298584, 'learning_rate': 2.163799671860117e-06} [Rank 1] Trainer log: {'loss': 0.8489, 'grad_norm': 4.128507137298584, 'learning_rate': 2.163799671860117e-06}[Rank 0] Trainer log: {'loss': 0.8489, 'grad_norm': 4.128507137298584, 'learning_rate': 2.163799671860117e-06} [Rank 2] Trainer log: {'loss': 0.8489, 'grad_norm': 4.128507137298584, 'learning_rate': 2.163799671860117e-06} {'loss': 0.8489, 'grad_norm': 4.128507137298584, 'learning_rate': 2.163799671860117e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.6891, 'grad_norm': 2.0666658878326416, 'learning_rate': 2.1595651488022785e-06}[Rank 2] Trainer log: {'loss': 0.6891, 'grad_norm': 2.0666658878326416, 'learning_rate': 2.1595651488022785e-06}[Rank 3] Trainer log: {'loss': 0.6891, 'grad_norm': 2.0666658878326416, 'learning_rate': 2.1595651488022785e-06} [Rank 0] Trainer log: {'loss': 0.6891, 'grad_norm': 2.0666658878326416, 'learning_rate': 2.1595651488022785e-06} {'loss': 0.6891, 'grad_norm': 2.0666658878326416, 'learning_rate': 2.1595651488022785e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.7792, 'grad_norm': 2.5435597896575928, 'learning_rate': 2.1553342716335458e-06}[Rank 3] Trainer log: {'loss': 0.7792, 'grad_norm': 2.5435597896575928, 'learning_rate': 2.1553342716335458e-06}[Rank 0] Trainer log: {'loss': 0.7792, 'grad_norm': 2.5435597896575928, 'learning_rate': 2.1553342716335458e-06} [Rank 2] Trainer log: {'loss': 0.7792, 'grad_norm': 2.5435597896575928, 'learning_rate': 2.1553342716335458e-06} {'loss': 0.7792, 'grad_norm': 2.5435597896575928, 'learning_rate': 2.1553342716335458e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.849, 'grad_norm': 4.472328186035156, 'learning_rate': 2.1511070423213164e-06}[Rank 2] Trainer log: {'loss': 0.849, 'grad_norm': 4.472328186035156, 'learning_rate': 2.1511070423213164e-06} [Rank 0] Trainer log: {'loss': 0.849, 'grad_norm': 4.472328186035156, 'learning_rate': 2.1511070423213164e-06}[Rank 3] Trainer log: {'loss': 0.849, 'grad_norm': 4.472328186035156, 'learning_rate': 2.1511070423213164e-06} {'loss': 0.849, 'grad_norm': 4.472328186035156, 'learning_rate': 2.1511070423213164e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.9406, 'grad_norm': 4.0452656745910645, 'learning_rate': 2.146883462831306e-06}[Rank 1] Trainer log: {'loss': 0.9406, 'grad_norm': 4.0452656745910645, 'learning_rate': 2.146883462831306e-06} [Rank 2] Trainer log: {'loss': 0.9406, 'grad_norm': 4.0452656745910645, 'learning_rate': 2.146883462831306e-06} [Rank 0] Trainer log: {'loss': 0.9406, 'grad_norm': 4.0452656745910645, 'learning_rate': 2.146883462831306e-06} {'loss': 0.9406, 'grad_norm': 4.0452656745910645, 'learning_rate': 2.146883462831306e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.7661, 'grad_norm': 1.9666451215744019, 'learning_rate': 2.1426635351275227e-06}[Rank 1] Trainer log: {'loss': 0.7661, 'grad_norm': 1.9666451215744019, 'learning_rate': 2.1426635351275227e-06}[Rank 2] Trainer log: {'loss': 0.7661, 'grad_norm': 1.9666451215744019, 'learning_rate': 2.1426635351275227e-06} [Rank 0] Trainer log: {'loss': 0.7661, 'grad_norm': 1.9666451215744019, 'learning_rate': 2.1426635351275227e-06} {'loss': 0.7661, 'grad_norm': 1.9666451215744019, 'learning_rate': 2.1426635351275227e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.9917, 'grad_norm': 5.339548110961914, 'learning_rate': 2.138447261172277e-06}[Rank 1] Trainer log: {'loss': 0.9917, 'grad_norm': 5.339548110961914, 'learning_rate': 2.138447261172277e-06}[Rank 2] Trainer log: {'loss': 0.9917, 'grad_norm': 5.339548110961914, 'learning_rate': 2.138447261172277e-06} [Rank 0] Trainer log: {'loss': 0.9917, 'grad_norm': 5.339548110961914, 'learning_rate': 2.138447261172277e-06} {'loss': 0.9917, 'grad_norm': 5.339548110961914, 'learning_rate': 2.138447261172277e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.7984, 'grad_norm': 3.3745508193969727, 'learning_rate': 2.1342346429261885e-06}[Rank 0] Trainer log: {'loss': 0.7984, 'grad_norm': 3.3745508193969727, 'learning_rate': 2.1342346429261885e-06}[Rank 3] Trainer log: {'loss': 0.7984, 'grad_norm': 3.3745508193969727, 'learning_rate': 2.1342346429261885e-06} [Rank 1] Trainer log: {'loss': 0.7984, 'grad_norm': 3.3745508193969727, 'learning_rate': 2.1342346429261885e-06} {'loss': 0.7984, 'grad_norm': 3.3745508193969727, 'learning_rate': 2.1342346429261885e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.9342, 'grad_norm': 5.531064510345459, 'learning_rate': 2.130025682348168e-06}[Rank 1] Trainer log: {'loss': 0.9342, 'grad_norm': 5.531064510345459, 'learning_rate': 2.130025682348168e-06}[Rank 3] Trainer log: {'loss': 0.9342, 'grad_norm': 5.531064510345459, 'learning_rate': 2.130025682348168e-06} [Rank 2] Trainer log: {'loss': 0.9342, 'grad_norm': 5.531064510345459, 'learning_rate': 2.130025682348168e-06} {'loss': 0.9342, 'grad_norm': 5.531064510345459, 'learning_rate': 2.130025682348168e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.9298, 'grad_norm': 2.452343463897705, 'learning_rate': 2.125820381395426e-06}[Rank 3] Trainer log: {'loss': 0.9298, 'grad_norm': 2.452343463897705, 'learning_rate': 2.125820381395426e-06} [Rank 2] Trainer log: {'loss': 0.9298, 'grad_norm': 2.452343463897705, 'learning_rate': 2.125820381395426e-06} [Rank 0] Trainer log: {'loss': 0.9298, 'grad_norm': 2.452343463897705, 'learning_rate': 2.125820381395426e-06} {'loss': 0.9298, 'grad_norm': 2.452343463897705, 'learning_rate': 2.125820381395426e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.7157, 'grad_norm': 4.207551002502441, 'learning_rate': 2.121618742023478e-06}[Rank 3] Trainer log: {'loss': 0.7157, 'grad_norm': 4.207551002502441, 'learning_rate': 2.121618742023478e-06} [Rank 1] Trainer log: {'loss': 0.7157, 'grad_norm': 4.207551002502441, 'learning_rate': 2.121618742023478e-06} [Rank 0] Trainer log: {'loss': 0.7157, 'grad_norm': 4.207551002502441, 'learning_rate': 2.121618742023478e-06} {'loss': 0.7157, 'grad_norm': 4.207551002502441, 'learning_rate': 2.121618742023478e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.6942, 'grad_norm': 2.1225929260253906, 'learning_rate': 2.1174207661861355e-06}[Rank 3] Trainer log: {'loss': 0.6942, 'grad_norm': 2.1225929260253906, 'learning_rate': 2.1174207661861355e-06} [Rank 2] Trainer log: {'loss': 0.6942, 'grad_norm': 2.1225929260253906, 'learning_rate': 2.1174207661861355e-06} [Rank 1] Trainer log: {'loss': 0.6942, 'grad_norm': 2.1225929260253906, 'learning_rate': 2.1174207661861355e-06} {'loss': 0.6942, 'grad_norm': 2.1225929260253906, 'learning_rate': 2.1174207661861355e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.9464, 'grad_norm': 7.210999488830566, 'learning_rate': 2.1132264558354967e-06}[Rank 0] Trainer log: {'loss': 0.9464, 'grad_norm': 7.210999488830566, 'learning_rate': 2.1132264558354967e-06} [Rank 3] Trainer log: {'loss': 0.9464, 'grad_norm': 7.210999488830566, 'learning_rate': 2.1132264558354967e-06} [Rank 2] Trainer log: {'loss': 0.9464, 'grad_norm': 7.210999488830566, 'learning_rate': 2.1132264558354967e-06} {'loss': 0.9464, 'grad_norm': 7.210999488830566, 'learning_rate': 2.1132264558354967e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.5553, 'grad_norm': 12.050198554992676, 'learning_rate': 2.109035812921969e-06}[Rank 1] Trainer log: {'loss': 0.5553, 'grad_norm': 12.050198554992676, 'learning_rate': 2.109035812921969e-06} [Rank 3] Trainer log: {'loss': 0.5553, 'grad_norm': 12.050198554992676, 'learning_rate': 2.109035812921969e-06} [Rank 2] Trainer log: {'loss': 0.5553, 'grad_norm': 12.050198554992676, 'learning_rate': 2.109035812921969e-06} {'loss': 0.5553, 'grad_norm': 12.050198554992676, 'learning_rate': 2.109035812921969e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 1.0079, 'grad_norm': 2.173792839050293, 'learning_rate': 2.1048488393942455e-06} [Rank 1] Trainer log: {'loss': 1.0079, 'grad_norm': 2.173792839050293, 'learning_rate': 2.1048488393942455e-06}[Rank 0] Trainer log: {'loss': 1.0079, 'grad_norm': 2.173792839050293, 'learning_rate': 2.1048488393942455e-06} [Rank 3] Trainer log: {'loss': 1.0079, 'grad_norm': 2.173792839050293, 'learning_rate': 2.1048488393942455e-06} {'loss': 1.0079, 'grad_norm': 2.173792839050293, 'learning_rate': 2.1048488393942455e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.483, 'grad_norm': 6.572681903839111, 'learning_rate': 2.1006655371993122e-06}[Rank 1] Trainer log: {'loss': 0.483, 'grad_norm': 6.572681903839111, 'learning_rate': 2.1006655371993122e-06} [Rank 0] Trainer log: {'loss': 0.483, 'grad_norm': 6.572681903839111, 'learning_rate': 2.1006655371993122e-06} [Rank 2] Trainer log: {'loss': 0.483, 'grad_norm': 6.572681903839111, 'learning_rate': 2.1006655371993122e-06} {'loss': 0.483, 'grad_norm': 6.572681903839111, 'learning_rate': 2.1006655371993122e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.7594, 'grad_norm': 16.230501174926758, 'learning_rate': 2.096485908282456e-06}[Rank 3] Trainer log: {'loss': 0.7594, 'grad_norm': 16.230501174926758, 'learning_rate': 2.096485908282456e-06}[Rank 2] Trainer log: {'loss': 0.7594, 'grad_norm': 16.230501174926758, 'learning_rate': 2.096485908282456e-06} [Rank 0] Trainer log: {'loss': 0.7594, 'grad_norm': 16.230501174926758, 'learning_rate': 2.096485908282456e-06} {'loss': 0.7594, 'grad_norm': 16.230501174926758, 'learning_rate': 2.096485908282456e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.8595, 'grad_norm': 9.47409439086914, 'learning_rate': 2.092309954587245e-06}[Rank 2] Trainer log: {'loss': 0.8595, 'grad_norm': 9.47409439086914, 'learning_rate': 2.092309954587245e-06}[Rank 0] Trainer log: {'loss': 0.8595, 'grad_norm': 9.47409439086914, 'learning_rate': 2.092309954587245e-06} [Rank 1] Trainer log: {'loss': 0.8595, 'grad_norm': 9.47409439086914, 'learning_rate': 2.092309954587245e-06} {'loss': 0.8595, 'grad_norm': 9.47409439086914, 'learning_rate': 2.092309954587245e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.5238, 'grad_norm': 5.117995738983154, 'learning_rate': 2.0881376780555506e-06}[Rank 0] Trainer log: {'loss': 0.5238, 'grad_norm': 5.117995738983154, 'learning_rate': 2.0881376780555506e-06} [Rank 1] Trainer log: {'loss': 0.5238, 'grad_norm': 5.117995738983154, 'learning_rate': 2.0881376780555506e-06} [Rank 3] Trainer log: {'loss': 0.5238, 'grad_norm': 5.117995738983154, 'learning_rate': 2.0881376780555506e-06} {'loss': 0.5238, 'grad_norm': 5.117995738983154, 'learning_rate': 2.0881376780555506e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 1.0511, 'grad_norm': 2.0477294921875, 'learning_rate': 2.0839690806275213e-06}[Rank 1] Trainer log: {'loss': 1.0511, 'grad_norm': 2.0477294921875, 'learning_rate': 2.0839690806275213e-06}[Rank 2] Trainer log: {'loss': 1.0511, 'grad_norm': 2.0477294921875, 'learning_rate': 2.0839690806275213e-06} [Rank 0] Trainer log: {'loss': 1.0511, 'grad_norm': 2.0477294921875, 'learning_rate': 2.0839690806275213e-06} {'loss': 1.0511, 'grad_norm': 2.0477294921875, 'learning_rate': 2.0839690806275213e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.9068, 'grad_norm': 3.448622703552246, 'learning_rate': 2.0798041642416067e-06}[Rank 3] Trainer log: {'loss': 0.9068, 'grad_norm': 3.448622703552246, 'learning_rate': 2.0798041642416067e-06}[Rank 1] Trainer log: {'loss': 0.9068, 'grad_norm': 3.448622703552246, 'learning_rate': 2.0798041642416067e-06} [Rank 2] Trainer log: {'loss': 0.9068, 'grad_norm': 3.448622703552246, 'learning_rate': 2.0798041642416067e-06} {'loss': 0.9068, 'grad_norm': 3.448622703552246, 'learning_rate': 2.0798041642416067e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.8376, 'grad_norm': 2.7350363731384277, 'learning_rate': 2.075642930834534e-06}[Rank 1] Trainer log: {'loss': 0.8376, 'grad_norm': 2.7350363731384277, 'learning_rate': 2.075642930834534e-06}[Rank 3] Trainer log: {'loss': 0.8376, 'grad_norm': 2.7350363731384277, 'learning_rate': 2.075642930834534e-06} {'loss': 0.8376, 'grad_norm': 2.7350363731384277, 'learning_rate': 2.075642930834534e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.8376, 'grad_norm': 2.7350363731384277, 'learning_rate': 2.075642930834534e-06} [Rank 3] Trainer log: {'loss': 0.97, 'grad_norm': 4.178371429443359, 'learning_rate': 2.0714853823413273e-06} [Rank 1] Trainer log: {'loss': 0.97, 'grad_norm': 4.178371429443359, 'learning_rate': 2.0714853823413273e-06}[Rank 0] Trainer log: {'loss': 0.97, 'grad_norm': 4.178371429443359, 'learning_rate': 2.0714853823413273e-06} {'loss': 0.97, 'grad_norm': 4.178371429443359, 'learning_rate': 2.0714853823413273e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.97, 'grad_norm': 4.178371429443359, 'learning_rate': 2.0714853823413273e-06} [Rank 0] Trainer log: {'loss': 1.0532, 'grad_norm': 2.4319300651550293, 'learning_rate': 2.0673315206952917e-06}[Rank 3] Trainer log: {'loss': 1.0532, 'grad_norm': 2.4319300651550293, 'learning_rate': 2.0673315206952917e-06} [Rank 1] Trainer log: {'loss': 1.0532, 'grad_norm': 2.4319300651550293, 'learning_rate': 2.0673315206952917e-06} [Rank 2] Trainer log: {'loss': 1.0532, 'grad_norm': 2.4319300651550293, 'learning_rate': 2.0673315206952917e-06} {'loss': 1.0532, 'grad_norm': 2.4319300651550293, 'learning_rate': 2.0673315206952917e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.9961, 'grad_norm': 3.136777639389038, 'learning_rate': 2.063181347828014e-06}[Rank 3] Trainer log: {'loss': 0.9961, 'grad_norm': 3.136777639389038, 'learning_rate': 2.063181347828014e-06}[Rank 2] Trainer log: {'loss': 0.9961, 'grad_norm': 3.136777639389038, 'learning_rate': 2.063181347828014e-06} [Rank 1] Trainer log: {'loss': 0.9961, 'grad_norm': 3.136777639389038, 'learning_rate': 2.063181347828014e-06} {'loss': 0.9961, 'grad_norm': 3.136777639389038, 'learning_rate': 2.063181347828014e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.7091, 'grad_norm': 1.5626147985458374, 'learning_rate': 2.0590348656693782e-06} [Rank 0] Trainer log: {'loss': 0.7091, 'grad_norm': 1.5626147985458374, 'learning_rate': 2.0590348656693782e-06}[Rank 1] Trainer log: {'loss': 0.7091, 'grad_norm': 1.5626147985458374, 'learning_rate': 2.0590348656693782e-06} [Rank 2] Trainer log: {'loss': 0.7091, 'grad_norm': 1.5626147985458374, 'learning_rate': 2.0590348656693782e-06} {'loss': 0.7091, 'grad_norm': 1.5626147985458374, 'learning_rate': 2.0590348656693782e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.8805, 'grad_norm': 3.1970274448394775, 'learning_rate': 2.0548920761475367e-06}[Rank 2] Trainer log: {'loss': 0.8805, 'grad_norm': 3.1970274448394775, 'learning_rate': 2.0548920761475367e-06}[Rank 3] Trainer log: {'loss': 0.8805, 'grad_norm': 3.1970274448394775, 'learning_rate': 2.0548920761475367e-06} {'loss': 0.8805, 'grad_norm': 3.1970274448394775, 'learning_rate': 2.0548920761475367e-06, 'epoch': 0.8} [Rank 1] Trainer log: {'loss': 0.8805, 'grad_norm': 3.1970274448394775, 'learning_rate': 2.0548920761475367e-06} [Rank 0] Trainer log: {'loss': 0.5452, 'grad_norm': 11.876959800720215, 'learning_rate': 2.0507529811889404e-06}[Rank 2] Trainer log: {'loss': 0.5452, 'grad_norm': 11.876959800720215, 'learning_rate': 2.0507529811889404e-06} [Rank 1] Trainer log: {'loss': 0.5452, 'grad_norm': 11.876959800720215, 'learning_rate': 2.0507529811889404e-06} [Rank 3] Trainer log: {'loss': 0.5452, 'grad_norm': 11.876959800720215, 'learning_rate': 2.0507529811889404e-06} {'loss': 0.5452, 'grad_norm': 11.876959800720215, 'learning_rate': 2.0507529811889404e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.633, 'grad_norm': 5.3465962409973145, 'learning_rate': 2.0466175827183077e-06} [Rank 2] Trainer log: {'loss': 0.633, 'grad_norm': 5.3465962409973145, 'learning_rate': 2.0466175827183077e-06}[Rank 1] Trainer log: {'loss': 0.633, 'grad_norm': 5.3465962409973145, 'learning_rate': 2.0466175827183077e-06} [Rank 0] Trainer log: {'loss': 0.633, 'grad_norm': 5.3465962409973145, 'learning_rate': 2.0466175827183077e-06} {'loss': 0.633, 'grad_norm': 5.3465962409973145, 'learning_rate': 2.0466175827183077e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.5357, 'grad_norm': 6.261940956115723, 'learning_rate': 2.04248588265865e-06}[Rank 2] Trainer log: {'loss': 0.5357, 'grad_norm': 6.261940956115723, 'learning_rate': 2.04248588265865e-06} [Rank 1] Trainer log: {'loss': 0.5357, 'grad_norm': 6.261940956115723, 'learning_rate': 2.04248588265865e-06} [Rank 0] Trainer log: {'loss': 0.5357, 'grad_norm': 6.261940956115723, 'learning_rate': 2.04248588265865e-06} {'loss': 0.5357, 'grad_norm': 6.261940956115723, 'learning_rate': 2.04248588265865e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.9873, 'grad_norm': 7.360349655151367, 'learning_rate': 2.038357882931251e-06} [Rank 2] Trainer log: {'loss': 0.9873, 'grad_norm': 7.360349655151367, 'learning_rate': 2.038357882931251e-06} [Rank 1] Trainer log: {'loss': 0.9873, 'grad_norm': 7.360349655151367, 'learning_rate': 2.038357882931251e-06} [Rank 0] Trainer log: {'loss': 0.9873, 'grad_norm': 7.360349655151367, 'learning_rate': 2.038357882931251e-06} {'loss': 0.9873, 'grad_norm': 7.360349655151367, 'learning_rate': 2.038357882931251e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.8458, 'grad_norm': 6.227353572845459, 'learning_rate': 2.0342335854556738e-06}[Rank 2] Trainer log: {'loss': 0.8458, 'grad_norm': 6.227353572845459, 'learning_rate': 2.0342335854556738e-06}[Rank 3] Trainer log: {'loss': 0.8458, 'grad_norm': 6.227353572845459, 'learning_rate': 2.0342335854556738e-06} [Rank 1] Trainer log: {'loss': 0.8458, 'grad_norm': 6.227353572845459, 'learning_rate': 2.0342335854556738e-06}{'loss': 0.8458, 'grad_norm': 6.227353572845459, 'learning_rate': 2.0342335854556738e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.9416, 'grad_norm': 2.6981966495513916, 'learning_rate': 2.0301129921497677e-06}[Rank 0] Trainer log: {'loss': 0.9416, 'grad_norm': 2.6981966495513916, 'learning_rate': 2.0301129921497677e-06}[Rank 2] Trainer log: {'loss': 0.9416, 'grad_norm': 2.6981966495513916, 'learning_rate': 2.0301129921497677e-06} [Rank 1] Trainer log: {'loss': 0.9416, 'grad_norm': 2.6981966495513916, 'learning_rate': 2.0301129921497677e-06} {'loss': 0.9416, 'grad_norm': 2.6981966495513916, 'learning_rate': 2.0301129921497677e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.8818, 'grad_norm': 10.424644470214844, 'learning_rate': 2.0259961049296517e-06} [Rank 3] Trainer log: {'loss': 0.8818, 'grad_norm': 10.424644470214844, 'learning_rate': 2.0259961049296517e-06} [Rank 1] Trainer log: {'loss': 0.8818, 'grad_norm': 10.424644470214844, 'learning_rate': 2.0259961049296517e-06} [Rank 0] Trainer log: {'loss': 0.8818, 'grad_norm': 10.424644470214844, 'learning_rate': 2.0259961049296517e-06} {'loss': 0.8818, 'grad_norm': 10.424644470214844, 'learning_rate': 2.0259961049296517e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.791, 'grad_norm': 4.267749309539795, 'learning_rate': 2.0218829257097216e-06} [Rank 2] Trainer log: {'loss': 0.791, 'grad_norm': 4.267749309539795, 'learning_rate': 2.0218829257097216e-06} [Rank 1] Trainer log: {'loss': 0.791, 'grad_norm': 4.267749309539795, 'learning_rate': 2.0218829257097216e-06} [Rank 0] Trainer log: {'loss': 0.791, 'grad_norm': 4.267749309539795, 'learning_rate': 2.0218829257097216e-06} {'loss': 0.791, 'grad_norm': 4.267749309539795, 'learning_rate': 2.0218829257097216e-06, 'epoch': 0.8} [Rank 3] Trainer log: {'loss': 0.8571, 'grad_norm': 6.739546298980713, 'learning_rate': 2.0177734564026543e-06} [Rank 2] Trainer log: {'loss': 0.8571, 'grad_norm': 6.739546298980713, 'learning_rate': 2.0177734564026543e-06} [Rank 1] Trainer log: {'loss': 0.8571, 'grad_norm': 6.739546298980713, 'learning_rate': 2.0177734564026543e-06} [Rank 0] Trainer log: {'loss': 0.8571, 'grad_norm': 6.739546298980713, 'learning_rate': 2.0177734564026543e-06} {'loss': 0.8571, 'grad_norm': 6.739546298980713, 'learning_rate': 2.0177734564026543e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.7382, 'grad_norm': 7.130889892578125, 'learning_rate': 2.0136676989193994e-06}[Rank 3] Trainer log: {'loss': 0.7382, 'grad_norm': 7.130889892578125, 'learning_rate': 2.0136676989193994e-06} [Rank 2] Trainer log: {'loss': 0.7382, 'grad_norm': 7.130889892578125, 'learning_rate': 2.0136676989193994e-06} [Rank 1] Trainer log: {'loss': 0.7382, 'grad_norm': 7.130889892578125, 'learning_rate': 2.0136676989193994e-06} {'loss': 0.7382, 'grad_norm': 7.130889892578125, 'learning_rate': 2.0136676989193994e-06, 'epoch': 0.8} [Rank 0] Trainer log: {'loss': 0.9114, 'grad_norm': 1.7664796113967896, 'learning_rate': 2.0095656551691756e-06}[Rank 3] Trainer log: {'loss': 0.9114, 'grad_norm': 1.7664796113967896, 'learning_rate': 2.0095656551691756e-06}[Rank 1] Trainer log: {'loss': 0.9114, 'grad_norm': 1.7664796113967896, 'learning_rate': 2.0095656551691756e-06} [Rank 2] Trainer log: {'loss': 0.9114, 'grad_norm': 1.7664796113967896, 'learning_rate': 2.0095656551691756e-06} {'loss': 0.9114, 'grad_norm': 1.7664796113967896, 'learning_rate': 2.0095656551691756e-06, 'epoch': 0.8} [Rank 2] Trainer log: {'loss': 0.5712, 'grad_norm': 4.68580436706543, 'learning_rate': 2.005467327059485e-06} [Rank 0] Trainer log: {'loss': 0.5712, 'grad_norm': 4.68580436706543, 'learning_rate': 2.005467327059485e-06}[Rank 3] Trainer log: {'loss': 0.5712, 'grad_norm': 4.68580436706543, 'learning_rate': 2.005467327059485e-06} [Rank 1] Trainer log: {'loss': 0.5712, 'grad_norm': 4.68580436706543, 'learning_rate': 2.005467327059485e-06} {'loss': 0.5712, 'grad_norm': 4.68580436706543, 'learning_rate': 2.005467327059485e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.7896, 'grad_norm': 4.132787227630615, 'learning_rate': 2.0013727164960904e-06}[Rank 1] Trainer log: {'loss': 0.7896, 'grad_norm': 4.132787227630615, 'learning_rate': 2.0013727164960904e-06}[Rank 3] Trainer log: {'loss': 0.7896, 'grad_norm': 4.132787227630615, 'learning_rate': 2.0013727164960904e-06} [Rank 0] Trainer log: {'loss': 0.7896, 'grad_norm': 4.132787227630615, 'learning_rate': 2.0013727164960904e-06} {'loss': 0.7896, 'grad_norm': 4.132787227630615, 'learning_rate': 2.0013727164960904e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.6228, 'grad_norm': 4.781400203704834, 'learning_rate': 1.997281825383031e-06}[Rank 3] Trainer log: {'loss': 0.6228, 'grad_norm': 4.781400203704834, 'learning_rate': 1.997281825383031e-06} [Rank 0] Trainer log: {'loss': 0.6228, 'grad_norm': 4.781400203704834, 'learning_rate': 1.997281825383031e-06}[Rank 2] Trainer log: {'loss': 0.6228, 'grad_norm': 4.781400203704834, 'learning_rate': 1.997281825383031e-06} {'loss': 0.6228, 'grad_norm': 4.781400203704834, 'learning_rate': 1.997281825383031e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.8432, 'grad_norm': 6.224388599395752, 'learning_rate': 1.99319465562262e-06}[Rank 1] Trainer log: {'loss': 0.8432, 'grad_norm': 6.224388599395752, 'learning_rate': 1.99319465562262e-06} [Rank 2] Trainer log: {'loss': 0.8432, 'grad_norm': 6.224388599395752, 'learning_rate': 1.99319465562262e-06} [Rank 0] Trainer log: {'loss': 0.8432, 'grad_norm': 6.224388599395752, 'learning_rate': 1.99319465562262e-06} {'loss': 0.8432, 'grad_norm': 6.224388599395752, 'learning_rate': 1.99319465562262e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.8897, 'grad_norm': 9.305953025817871, 'learning_rate': 1.9891112091154354e-06}[Rank 3] Trainer log: {'loss': 0.8897, 'grad_norm': 9.305953025817871, 'learning_rate': 1.9891112091154354e-06} [Rank 1] Trainer log: {'loss': 0.8897, 'grad_norm': 9.305953025817871, 'learning_rate': 1.9891112091154354e-06} {'loss': 0.8897, 'grad_norm': 9.305953025817871, 'learning_rate': 1.9891112091154354e-06, 'epoch': 0.81}[Rank 2] Trainer log: {'loss': 0.8897, 'grad_norm': 9.305953025817871, 'learning_rate': 1.9891112091154354e-06} [Rank 0] Trainer log: {'loss': 0.9563, 'grad_norm': 7.937966823577881, 'learning_rate': 1.985031487760322e-06}[Rank 3] Trainer log: {'loss': 0.9563, 'grad_norm': 7.937966823577881, 'learning_rate': 1.985031487760322e-06} [Rank 1] Trainer log: {'loss': 0.9563, 'grad_norm': 7.937966823577881, 'learning_rate': 1.985031487760322e-06} [Rank 2] Trainer log: {'loss': 0.9563, 'grad_norm': 7.937966823577881, 'learning_rate': 1.985031487760322e-06} {'loss': 0.9563, 'grad_norm': 7.937966823577881, 'learning_rate': 1.985031487760322e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.874, 'grad_norm': 2.660583972930908, 'learning_rate': 1.980955493454397e-06}[Rank 2] Trainer log: {'loss': 0.874, 'grad_norm': 2.660583972930908, 'learning_rate': 1.980955493454397e-06}[Rank 3] Trainer log: {'loss': 0.874, 'grad_norm': 2.660583972930908, 'learning_rate': 1.980955493454397e-06} [Rank 1] Trainer log: {'loss': 0.874, 'grad_norm': 2.660583972930908, 'learning_rate': 1.980955493454397e-06} {'loss': 0.874, 'grad_norm': 2.660583972930908, 'learning_rate': 1.980955493454397e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.6427, 'grad_norm': 7.761508464813232, 'learning_rate': 1.9768832280930472e-06}[Rank 1] Trainer log: {'loss': 0.6427, 'grad_norm': 7.761508464813232, 'learning_rate': 1.9768832280930472e-06}[Rank 0] Trainer log: {'loss': 0.6427, 'grad_norm': 7.761508464813232, 'learning_rate': 1.9768832280930472e-06} [Rank 2] Trainer log: {'loss': 0.6427, 'grad_norm': 7.761508464813232, 'learning_rate': 1.9768832280930472e-06} {'loss': 0.6427, 'grad_norm': 7.761508464813232, 'learning_rate': 1.9768832280930472e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.6343, 'grad_norm': 6.885963439941406, 'learning_rate': 1.9728146935699145e-06}[Rank 3] Trainer log: {'loss': 0.6343, 'grad_norm': 6.885963439941406, 'learning_rate': 1.9728146935699145e-06} [Rank 2] Trainer log: {'loss': 0.6343, 'grad_norm': 6.885963439941406, 'learning_rate': 1.9728146935699145e-06} [Rank 0] Trainer log: {'loss': 0.6343, 'grad_norm': 6.885963439941406, 'learning_rate': 1.9728146935699145e-06} {'loss': 0.6343, 'grad_norm': 6.885963439941406, 'learning_rate': 1.9728146935699145e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.8655, 'grad_norm': 4.214710712432861, 'learning_rate': 1.968749891776918e-06}[Rank 0] Trainer log: {'loss': 0.8655, 'grad_norm': 4.214710712432861, 'learning_rate': 1.968749891776918e-06} [Rank 2] Trainer log: {'loss': 0.8655, 'grad_norm': 4.214710712432861, 'learning_rate': 1.968749891776918e-06} [Rank 1] Trainer log: {'loss': 0.8655, 'grad_norm': 4.214710712432861, 'learning_rate': 1.968749891776918e-06} {'loss': 0.8655, 'grad_norm': 4.214710712432861, 'learning_rate': 1.968749891776918e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.7979, 'grad_norm': 4.999901294708252, 'learning_rate': 1.964688824604234e-06}[Rank 0] Trainer log: {'loss': 0.7979, 'grad_norm': 4.999901294708252, 'learning_rate': 1.964688824604234e-06}[Rank 2] Trainer log: {'loss': 0.7979, 'grad_norm': 4.999901294708252, 'learning_rate': 1.964688824604234e-06} [Rank 1] Trainer log: {'loss': 0.7979, 'grad_norm': 4.999901294708252, 'learning_rate': 1.964688824604234e-06} {'loss': 0.7979, 'grad_norm': 4.999901294708252, 'learning_rate': 1.964688824604234e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.7939, 'grad_norm': 1.8903149366378784, 'learning_rate': 1.9606314939403003e-06}[Rank 3] Trainer log: {'loss': 0.7939, 'grad_norm': 1.8903149366378784, 'learning_rate': 1.9606314939403003e-06}[Rank 1] Trainer log: {'loss': 0.7939, 'grad_norm': 1.8903149366378784, 'learning_rate': 1.9606314939403003e-06} [Rank 2] Trainer log: {'loss': 0.7939, 'grad_norm': 1.8903149366378784, 'learning_rate': 1.9606314939403003e-06} {'loss': 0.7939, 'grad_norm': 1.8903149366378784, 'learning_rate': 1.9606314939403003e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.9634, 'grad_norm': 2.845538377761841, 'learning_rate': 1.9565779016718257e-06}[Rank 1] Trainer log: {'loss': 0.9634, 'grad_norm': 2.845538377761841, 'learning_rate': 1.9565779016718257e-06}[Rank 3] Trainer log: {'loss': 0.9634, 'grad_norm': 2.845538377761841, 'learning_rate': 1.9565779016718257e-06} [Rank 0] Trainer log: {'loss': 0.9634, 'grad_norm': 2.845538377761841, 'learning_rate': 1.9565779016718257e-06} {'loss': 0.9634, 'grad_norm': 2.845538377761841, 'learning_rate': 1.9565779016718257e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.9785, 'grad_norm': 4.242344379425049, 'learning_rate': 1.952528049683774e-06} [Rank 1] Trainer log: {'loss': 0.9785, 'grad_norm': 4.242344379425049, 'learning_rate': 1.952528049683774e-06} [Rank 3] Trainer log: {'loss': 0.9785, 'grad_norm': 4.242344379425049, 'learning_rate': 1.952528049683774e-06} [Rank 0] Trainer log: {'loss': 0.9785, 'grad_norm': 4.242344379425049, 'learning_rate': 1.952528049683774e-06} {'loss': 0.9785, 'grad_norm': 4.242344379425049, 'learning_rate': 1.952528049683774e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.6403, 'grad_norm': 2.430453062057495, 'learning_rate': 1.948481939859367e-06}[Rank 1] Trainer log: {'loss': 0.6403, 'grad_norm': 2.430453062057495, 'learning_rate': 1.948481939859367e-06} [Rank 0] Trainer log: {'loss': 0.6403, 'grad_norm': 2.430453062057495, 'learning_rate': 1.948481939859367e-06} [Rank 2] Trainer log: {'loss': 0.6403, 'grad_norm': 2.430453062057495, 'learning_rate': 1.948481939859367e-06} {'loss': 0.6403, 'grad_norm': 2.430453062057495, 'learning_rate': 1.948481939859367e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.9747, 'grad_norm': 2.972210645675659, 'learning_rate': 1.9444395740800948e-06}[Rank 1] Trainer log: {'loss': 0.9747, 'grad_norm': 2.972210645675659, 'learning_rate': 1.9444395740800948e-06} [Rank 3] Trainer log: {'loss': 0.9747, 'grad_norm': 2.972210645675659, 'learning_rate': 1.9444395740800948e-06} [Rank 2] Trainer log: {'loss': 0.9747, 'grad_norm': 2.972210645675659, 'learning_rate': 1.9444395740800948e-06} {'loss': 0.9747, 'grad_norm': 2.972210645675659, 'learning_rate': 1.9444395740800948e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.5688, 'grad_norm': 4.506748676300049, 'learning_rate': 1.940400954225704e-06} [Rank 2] Trainer log: {'loss': 0.5688, 'grad_norm': 4.506748676300049, 'learning_rate': 1.940400954225704e-06} [Rank 1] Trainer log: {'loss': 0.5688, 'grad_norm': 4.506748676300049, 'learning_rate': 1.940400954225704e-06} [Rank 0] Trainer log: {'loss': 0.5688, 'grad_norm': 4.506748676300049, 'learning_rate': 1.940400954225704e-06} {'loss': 0.5688, 'grad_norm': 4.506748676300049, 'learning_rate': 1.940400954225704e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 1.042, 'grad_norm': 2.8170249462127686, 'learning_rate': 1.9363660821741947e-06}[Rank 0] Trainer log: {'loss': 1.042, 'grad_norm': 2.8170249462127686, 'learning_rate': 1.9363660821741947e-06} [Rank 2] Trainer log: {'loss': 1.042, 'grad_norm': 2.8170249462127686, 'learning_rate': 1.9363660821741947e-06} [Rank 1] Trainer log: {'loss': 1.042, 'grad_norm': 2.8170249462127686, 'learning_rate': 1.9363660821741947e-06} {'loss': 1.042, 'grad_norm': 2.8170249462127686, 'learning_rate': 1.9363660821741947e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.7358, 'grad_norm': 6.827178955078125, 'learning_rate': 1.9323349598018247e-06}[Rank 0] Trainer log: {'loss': 0.7358, 'grad_norm': 6.827178955078125, 'learning_rate': 1.9323349598018247e-06}[Rank 1] Trainer log: {'loss': 0.7358, 'grad_norm': 6.827178955078125, 'learning_rate': 1.9323349598018247e-06} [Rank 2] Trainer log: {'loss': 0.7358, 'grad_norm': 6.827178955078125, 'learning_rate': 1.9323349598018247e-06} {'loss': 0.7358, 'grad_norm': 6.827178955078125, 'learning_rate': 1.9323349598018247e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.7722, 'grad_norm': 2.454408645629883, 'learning_rate': 1.928307588983117e-06}[Rank 3] Trainer log: {'loss': 0.7722, 'grad_norm': 2.454408645629883, 'learning_rate': 1.928307588983117e-06}[Rank 1] Trainer log: {'loss': 0.7722, 'grad_norm': 2.454408645629883, 'learning_rate': 1.928307588983117e-06} [Rank 2] Trainer log: {'loss': 0.7722, 'grad_norm': 2.454408645629883, 'learning_rate': 1.928307588983117e-06} {'loss': 0.7722, 'grad_norm': 2.454408645629883, 'learning_rate': 1.928307588983117e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.8177, 'grad_norm': 2.051642417907715, 'learning_rate': 1.92428397159084e-06}[Rank 1] Trainer log: {'loss': 0.8177, 'grad_norm': 2.051642417907715, 'learning_rate': 1.92428397159084e-06} [Rank 0] Trainer log: {'loss': 0.8177, 'grad_norm': 2.051642417907715, 'learning_rate': 1.92428397159084e-06} [Rank 2] Trainer log: {'loss': 0.8177, 'grad_norm': 2.051642417907715, 'learning_rate': 1.92428397159084e-06} {'loss': 0.8177, 'grad_norm': 2.051642417907715, 'learning_rate': 1.92428397159084e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.848, 'grad_norm': 6.6601481437683105, 'learning_rate': 1.9202641094960185e-06}[Rank 0] Trainer log: {'loss': 0.848, 'grad_norm': 6.6601481437683105, 'learning_rate': 1.9202641094960185e-06} [Rank 1] Trainer log: {'loss': 0.848, 'grad_norm': 6.6601481437683105, 'learning_rate': 1.9202641094960185e-06}[Rank 2] Trainer log: {'loss': 0.848, 'grad_norm': 6.6601481437683105, 'learning_rate': 1.9202641094960185e-06} {'loss': 0.848, 'grad_norm': 6.6601481437683105, 'learning_rate': 1.9202641094960185e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.8917, 'grad_norm': 6.15248441696167, 'learning_rate': 1.9162480045679365e-06} [Rank 3] Trainer log: {'loss': 0.8917, 'grad_norm': 6.15248441696167, 'learning_rate': 1.9162480045679365e-06} [Rank 2] Trainer log: {'loss': 0.8917, 'grad_norm': 6.15248441696167, 'learning_rate': 1.9162480045679365e-06} [Rank 0] Trainer log: {'loss': 0.8917, 'grad_norm': 6.15248441696167, 'learning_rate': 1.9162480045679365e-06} {'loss': 0.8917, 'grad_norm': 6.15248441696167, 'learning_rate': 1.9162480045679365e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 1.0662, 'grad_norm': 4.759413242340088, 'learning_rate': 1.912235658674123e-06}[Rank 3] Trainer log: {'loss': 1.0662, 'grad_norm': 4.759413242340088, 'learning_rate': 1.912235658674123e-06} [Rank 2] Trainer log: {'loss': 1.0662, 'grad_norm': 4.759413242340088, 'learning_rate': 1.912235658674123e-06} [Rank 1] Trainer log: {'loss': 1.0662, 'grad_norm': 4.759413242340088, 'learning_rate': 1.912235658674123e-06} {'loss': 1.0662, 'grad_norm': 4.759413242340088, 'learning_rate': 1.912235658674123e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.909, 'grad_norm': 1.5644314289093018, 'learning_rate': 1.908227073680368e-06} [Rank 1] Trainer log: {'loss': 0.909, 'grad_norm': 1.5644314289093018, 'learning_rate': 1.908227073680368e-06} [Rank 2] Trainer log: {'loss': 0.909, 'grad_norm': 1.5644314289093018, 'learning_rate': 1.908227073680368e-06} [Rank 0] Trainer log: {'loss': 0.909, 'grad_norm': 1.5644314289093018, 'learning_rate': 1.908227073680368e-06} {'loss': 0.909, 'grad_norm': 1.5644314289093018, 'learning_rate': 1.908227073680368e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.6304, 'grad_norm': 5.918346881866455, 'learning_rate': 1.904222251450707e-06}[Rank 3] Trainer log: {'loss': 0.6304, 'grad_norm': 5.918346881866455, 'learning_rate': 1.904222251450707e-06} [Rank 1] Trainer log: {'loss': 0.6304, 'grad_norm': 5.918346881866455, 'learning_rate': 1.904222251450707e-06} [Rank 2] Trainer log: {'loss': 0.6304, 'grad_norm': 5.918346881866455, 'learning_rate': 1.904222251450707e-06} {'loss': 0.6304, 'grad_norm': 5.918346881866455, 'learning_rate': 1.904222251450707e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.5981, 'grad_norm': 3.9498088359832764, 'learning_rate': 1.9002211938474257e-06}[Rank 3] Trainer log: {'loss': 0.5981, 'grad_norm': 3.9498088359832764, 'learning_rate': 1.9002211938474257e-06} [Rank 1] Trainer log: {'loss': 0.5981, 'grad_norm': 3.9498088359832764, 'learning_rate': 1.9002211938474257e-06} [Rank 2] Trainer log: {'loss': 0.5981, 'grad_norm': 3.9498088359832764, 'learning_rate': 1.9002211938474257e-06} {'loss': 0.5981, 'grad_norm': 3.9498088359832764, 'learning_rate': 1.9002211938474257e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.7174, 'grad_norm': 4.598562717437744, 'learning_rate': 1.896223902731058e-06}[Rank 0] Trainer log: {'loss': 0.7174, 'grad_norm': 4.598562717437744, 'learning_rate': 1.896223902731058e-06} [Rank 2] Trainer log: {'loss': 0.7174, 'grad_norm': 4.598562717437744, 'learning_rate': 1.896223902731058e-06} [Rank 3] Trainer log: {'loss': 0.7174, 'grad_norm': 4.598562717437744, 'learning_rate': 1.896223902731058e-06} {'loss': 0.7174, 'grad_norm': 4.598562717437744, 'learning_rate': 1.896223902731058e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.6339, 'grad_norm': 2.9272096157073975, 'learning_rate': 1.8922303799603935e-06}[Rank 0] Trainer log: {'loss': 0.6339, 'grad_norm': 2.9272096157073975, 'learning_rate': 1.8922303799603935e-06} [Rank 1] Trainer log: {'loss': 0.6339, 'grad_norm': 2.9272096157073975, 'learning_rate': 1.8922303799603935e-06} [Rank 2] Trainer log: {'loss': 0.6339, 'grad_norm': 2.9272096157073975, 'learning_rate': 1.8922303799603935e-06} {'loss': 0.6339, 'grad_norm': 2.9272096157073975, 'learning_rate': 1.8922303799603935e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.7056, 'grad_norm': 3.8104734420776367, 'learning_rate': 1.8882406273924614e-06}[Rank 1] Trainer log: {'loss': 0.7056, 'grad_norm': 3.8104734420776367, 'learning_rate': 1.8882406273924614e-06} [Rank 2] Trainer log: {'loss': 0.7056, 'grad_norm': 3.8104734420776367, 'learning_rate': 1.8882406273924614e-06} [Rank 0] Trainer log: {'loss': 0.7056, 'grad_norm': 3.8104734420776367, 'learning_rate': 1.8882406273924614e-06} {'loss': 0.7056, 'grad_norm': 3.8104734420776367, 'learning_rate': 1.8882406273924614e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.8903, 'grad_norm': 5.163686275482178, 'learning_rate': 1.8842546468825396e-06} [Rank 3] Trainer log: {'loss': 0.8903, 'grad_norm': 5.163686275482178, 'learning_rate': 1.8842546468825396e-06} [Rank 0] Trainer log: {'loss': 0.8903, 'grad_norm': 5.163686275482178, 'learning_rate': 1.8842546468825396e-06}[Rank 2] Trainer log: {'loss': 0.8903, 'grad_norm': 5.163686275482178, 'learning_rate': 1.8842546468825396e-06} {'loss': 0.8903, 'grad_norm': 5.163686275482178, 'learning_rate': 1.8842546468825396e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.9865, 'grad_norm': 3.8608200550079346, 'learning_rate': 1.8802724402841565e-06}[Rank 3] Trainer log: {'loss': 0.9865, 'grad_norm': 3.8608200550079346, 'learning_rate': 1.8802724402841565e-06}[Rank 1] Trainer log: {'loss': 0.9865, 'grad_norm': 3.8608200550079346, 'learning_rate': 1.8802724402841565e-06} [Rank 2] Trainer log: {'loss': 0.9865, 'grad_norm': 3.8608200550079346, 'learning_rate': 1.8802724402841565e-06} {'loss': 0.9865, 'grad_norm': 3.8608200550079346, 'learning_rate': 1.8802724402841565e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.7434, 'grad_norm': 6.137917995452881, 'learning_rate': 1.87629400944908e-06}[Rank 2] Trainer log: {'loss': 0.7434, 'grad_norm': 6.137917995452881, 'learning_rate': 1.87629400944908e-06} [Rank 0] Trainer log: {'loss': 0.7434, 'grad_norm': 6.137917995452881, 'learning_rate': 1.87629400944908e-06} [Rank 1] Trainer log: {'loss': 0.7434, 'grad_norm': 6.137917995452881, 'learning_rate': 1.87629400944908e-06} {'loss': 0.7434, 'grad_norm': 6.137917995452881, 'learning_rate': 1.87629400944908e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.7367, 'grad_norm': 11.542950630187988, 'learning_rate': 1.8723193562273235e-06}[Rank 3] Trainer log: {'loss': 0.7367, 'grad_norm': 11.542950630187988, 'learning_rate': 1.8723193562273235e-06} [Rank 0] Trainer log: {'loss': 0.7367, 'grad_norm': 11.542950630187988, 'learning_rate': 1.8723193562273235e-06} [Rank 1] Trainer log: {'loss': 0.7367, 'grad_norm': 11.542950630187988, 'learning_rate': 1.8723193562273235e-06} {'loss': 0.7367, 'grad_norm': 11.542950630187988, 'learning_rate': 1.8723193562273235e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.7198, 'grad_norm': 5.9332733154296875, 'learning_rate': 1.8683484824671506e-06}[Rank 1] Trainer log: {'loss': 0.7198, 'grad_norm': 5.9332733154296875, 'learning_rate': 1.8683484824671506e-06}[Rank 0] Trainer log: {'loss': 0.7198, 'grad_norm': 5.9332733154296875, 'learning_rate': 1.8683484824671506e-06} [Rank 2] Trainer log: {'loss': 0.7198, 'grad_norm': 5.9332733154296875, 'learning_rate': 1.8683484824671506e-06} {'loss': 0.7198, 'grad_norm': 5.9332733154296875, 'learning_rate': 1.8683484824671506e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.9279, 'grad_norm': 3.185603380203247, 'learning_rate': 1.8643813900150564e-06} [Rank 0] Trainer log: {'loss': 0.9279, 'grad_norm': 3.185603380203247, 'learning_rate': 1.8643813900150564e-06}[Rank 2] Trainer log: {'loss': 0.9279, 'grad_norm': 3.185603380203247, 'learning_rate': 1.8643813900150564e-06} [Rank 1] Trainer log: {'loss': 0.9279, 'grad_norm': 3.185603380203247, 'learning_rate': 1.8643813900150564e-06} {'loss': 0.9279, 'grad_norm': 3.185603380203247, 'learning_rate': 1.8643813900150564e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.9227, 'grad_norm': 3.4484596252441406, 'learning_rate': 1.8604180807157824e-06} [Rank 3] Trainer log: {'loss': 0.9227, 'grad_norm': 3.4484596252441406, 'learning_rate': 1.8604180807157824e-06} [Rank 0] Trainer log: {'loss': 0.9227, 'grad_norm': 3.4484596252441406, 'learning_rate': 1.8604180807157824e-06} [Rank 1] Trainer log: {'loss': 0.9227, 'grad_norm': 3.4484596252441406, 'learning_rate': 1.8604180807157824e-06} {'loss': 0.9227, 'grad_norm': 3.4484596252441406, 'learning_rate': 1.8604180807157824e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.6334, 'grad_norm': 6.092584609985352, 'learning_rate': 1.856458556412315e-06}[Rank 3] Trainer log: {'loss': 0.6334, 'grad_norm': 6.092584609985352, 'learning_rate': 1.856458556412315e-06} [Rank 2] Trainer log: {'loss': 0.6334, 'grad_norm': 6.092584609985352, 'learning_rate': 1.856458556412315e-06} [Rank 0] Trainer log: {'loss': 0.6334, 'grad_norm': 6.092584609985352, 'learning_rate': 1.856458556412315e-06} {'loss': 0.6334, 'grad_norm': 6.092584609985352, 'learning_rate': 1.856458556412315e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.839, 'grad_norm': 14.496936798095703, 'learning_rate': 1.8525028189458748e-06} [Rank 0] Trainer log: {'loss': 0.839, 'grad_norm': 14.496936798095703, 'learning_rate': 1.8525028189458748e-06}[Rank 2] Trainer log: {'loss': 0.839, 'grad_norm': 14.496936798095703, 'learning_rate': 1.8525028189458748e-06} [Rank 3] Trainer log: {'loss': 0.839, 'grad_norm': 14.496936798095703, 'learning_rate': 1.8525028189458748e-06} {'loss': 0.839, 'grad_norm': 14.496936798095703, 'learning_rate': 1.8525028189458748e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.9381, 'grad_norm': 2.916999340057373, 'learning_rate': 1.8485508701559219e-06} [Rank 0] Trainer log: {'loss': 0.9381, 'grad_norm': 2.916999340057373, 'learning_rate': 1.8485508701559219e-06}[Rank 1] Trainer log: {'loss': 0.9381, 'grad_norm': 2.916999340057373, 'learning_rate': 1.8485508701559219e-06} [Rank 2] Trainer log: {'loss': 0.9381, 'grad_norm': 2.916999340057373, 'learning_rate': 1.8485508701559219e-06} {'loss': 0.9381, 'grad_norm': 2.916999340057373, 'learning_rate': 1.8485508701559219e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.6584, 'grad_norm': 3.564635992050171, 'learning_rate': 1.84460271188016e-06}[Rank 3] Trainer log: {'loss': 0.6584, 'grad_norm': 3.564635992050171, 'learning_rate': 1.84460271188016e-06}[Rank 0] Trainer log: {'loss': 0.6584, 'grad_norm': 3.564635992050171, 'learning_rate': 1.84460271188016e-06} [Rank 2] Trainer log: {'loss': 0.6584, 'grad_norm': 3.564635992050171, 'learning_rate': 1.84460271188016e-06} {'loss': 0.6584, 'grad_norm': 3.564635992050171, 'learning_rate': 1.84460271188016e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.7438, 'grad_norm': 5.669586658477783, 'learning_rate': 1.8406583459545225e-06}[Rank 3] Trainer log: {'loss': 0.7438, 'grad_norm': 5.669586658477783, 'learning_rate': 1.8406583459545225e-06} [Rank 1] Trainer log: {'loss': 0.7438, 'grad_norm': 5.669586658477783, 'learning_rate': 1.8406583459545225e-06} [Rank 2] Trainer log: {'loss': 0.7438, 'grad_norm': 5.669586658477783, 'learning_rate': 1.8406583459545225e-06} {'loss': 0.7438, 'grad_norm': 5.669586658477783, 'learning_rate': 1.8406583459545225e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.6578, 'grad_norm': 7.809361934661865, 'learning_rate': 1.836717774213187e-06}[Rank 1] Trainer log: {'loss': 0.6578, 'grad_norm': 7.809361934661865, 'learning_rate': 1.836717774213187e-06} [Rank 2] Trainer log: {'loss': 0.6578, 'grad_norm': 7.809361934661865, 'learning_rate': 1.836717774213187e-06}[Rank 0] Trainer log: {'loss': 0.6578, 'grad_norm': 7.809361934661865, 'learning_rate': 1.836717774213187e-06} {'loss': 0.6578, 'grad_norm': 7.809361934661865, 'learning_rate': 1.836717774213187e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.8429, 'grad_norm': 4.537125587463379, 'learning_rate': 1.8327809984885592e-06}[Rank 1] Trainer log: {'loss': 0.8429, 'grad_norm': 4.537125587463379, 'learning_rate': 1.8327809984885592e-06} [Rank 0] Trainer log: {'loss': 0.8429, 'grad_norm': 4.537125587463379, 'learning_rate': 1.8327809984885592e-06} [Rank 2] Trainer log: {'loss': 0.8429, 'grad_norm': 4.537125587463379, 'learning_rate': 1.8327809984885592e-06} {'loss': 0.8429, 'grad_norm': 4.537125587463379, 'learning_rate': 1.8327809984885592e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.6317, 'grad_norm': 14.170788764953613, 'learning_rate': 1.8288480206112879e-06}[Rank 3] Trainer log: {'loss': 0.6317, 'grad_norm': 14.170788764953613, 'learning_rate': 1.8288480206112879e-06} [Rank 0] Trainer log: {'loss': 0.6317, 'grad_norm': 14.170788764953613, 'learning_rate': 1.8288480206112879e-06}[Rank 1] Trainer log: {'loss': 0.6317, 'grad_norm': 14.170788764953613, 'learning_rate': 1.8288480206112879e-06} {'loss': 0.6317, 'grad_norm': 14.170788764953613, 'learning_rate': 1.8288480206112879e-06, 'epoch': 0.81} [Rank 2] Trainer log: {'loss': 0.5809, 'grad_norm': 4.277110576629639, 'learning_rate': 1.8249188424102492e-06} [Rank 3] Trainer log: {'loss': 0.5809, 'grad_norm': 4.277110576629639, 'learning_rate': 1.8249188424102492e-06} [Rank 0] Trainer log: {'loss': 0.5809, 'grad_norm': 4.277110576629639, 'learning_rate': 1.8249188424102492e-06}[Rank 1] Trainer log: {'loss': 0.5809, 'grad_norm': 4.277110576629639, 'learning_rate': 1.8249188424102492e-06} {'loss': 0.5809, 'grad_norm': 4.277110576629639, 'learning_rate': 1.8249188424102492e-06, 'epoch': 0.81} [Rank 1] Trainer log: {'loss': 0.9714, 'grad_norm': 3.7254912853240967, 'learning_rate': 1.8209934657125528e-06}[Rank 0] Trainer log: {'loss': 0.9714, 'grad_norm': 3.7254912853240967, 'learning_rate': 1.8209934657125528e-06}[Rank 3] Trainer log: {'loss': 0.9714, 'grad_norm': 3.7254912853240967, 'learning_rate': 1.8209934657125528e-06} [Rank 2] Trainer log: {'loss': 0.9714, 'grad_norm': 3.7254912853240967, 'learning_rate': 1.8209934657125528e-06} {'loss': 0.9714, 'grad_norm': 3.7254912853240967, 'learning_rate': 1.8209934657125528e-06, 'epoch': 0.81} [Rank 0] Trainer log: {'loss': 0.7961, 'grad_norm': 3.4773671627044678, 'learning_rate': 1.817071892343546e-06}[Rank 1] Trainer log: {'loss': 0.7961, 'grad_norm': 3.4773671627044678, 'learning_rate': 1.817071892343546e-06}[Rank 3] Trainer log: {'loss': 0.7961, 'grad_norm': 3.4773671627044678, 'learning_rate': 1.817071892343546e-06} [Rank 2] Trainer log: {'loss': 0.7961, 'grad_norm': 3.4773671627044678, 'learning_rate': 1.817071892343546e-06} {'loss': 0.7961, 'grad_norm': 3.4773671627044678, 'learning_rate': 1.817071892343546e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 1.0424, 'grad_norm': 2.063523054122925, 'learning_rate': 1.813154124126798e-06}[Rank 1] Trainer log: {'loss': 1.0424, 'grad_norm': 2.063523054122925, 'learning_rate': 1.813154124126798e-06} [Rank 0] Trainer log: {'loss': 1.0424, 'grad_norm': 2.063523054122925, 'learning_rate': 1.813154124126798e-06} [Rank 2] Trainer log: {'loss': 1.0424, 'grad_norm': 2.063523054122925, 'learning_rate': 1.813154124126798e-06} {'loss': 1.0424, 'grad_norm': 2.063523054122925, 'learning_rate': 1.813154124126798e-06, 'epoch': 0.81} [Rank 3] Trainer log: {'loss': 0.9282, 'grad_norm': 3.280003309249878, 'learning_rate': 1.8092401628841228e-06} [Rank 0] Trainer log: {'loss': 0.9282, 'grad_norm': 3.280003309249878, 'learning_rate': 1.8092401628841228e-06}[Rank 1] Trainer log: {'loss': 0.9282, 'grad_norm': 3.280003309249878, 'learning_rate': 1.8092401628841228e-06} [Rank 2] Trainer log: {'loss': 0.9282, 'grad_norm': 3.280003309249878, 'learning_rate': 1.8092401628841228e-06} {'loss': 0.9282, 'grad_norm': 3.280003309249878, 'learning_rate': 1.8092401628841228e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.9022, 'grad_norm': 10.205897331237793, 'learning_rate': 1.8053300104355476e-06}[Rank 3] Trainer log: {'loss': 0.9022, 'grad_norm': 10.205897331237793, 'learning_rate': 1.8053300104355476e-06} [Rank 1] Trainer log: {'loss': 0.9022, 'grad_norm': 10.205897331237793, 'learning_rate': 1.8053300104355476e-06} [Rank 2] Trainer log: {'loss': 0.9022, 'grad_norm': 10.205897331237793, 'learning_rate': 1.8053300104355476e-06} {'loss': 0.9022, 'grad_norm': 10.205897331237793, 'learning_rate': 1.8053300104355476e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.827, 'grad_norm': 4.232362747192383, 'learning_rate': 1.8014236685993424e-06}[Rank 1] Trainer log: {'loss': 0.827, 'grad_norm': 4.232362747192383, 'learning_rate': 1.8014236685993424e-06}[Rank 2] Trainer log: {'loss': 0.827, 'grad_norm': 4.232362747192383, 'learning_rate': 1.8014236685993424e-06} [Rank 3] Trainer log: {'loss': 0.827, 'grad_norm': 4.232362747192383, 'learning_rate': 1.8014236685993424e-06} {'loss': 0.827, 'grad_norm': 4.232362747192383, 'learning_rate': 1.8014236685993424e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.7658, 'grad_norm': 8.514174461364746, 'learning_rate': 1.7975211391919955e-06}[Rank 2] Trainer log: {'loss': 0.7658, 'grad_norm': 8.514174461364746, 'learning_rate': 1.7975211391919955e-06}[Rank 0] Trainer log: {'loss': 0.7658, 'grad_norm': 8.514174461364746, 'learning_rate': 1.7975211391919955e-06} [Rank 3] Trainer log: {'loss': 0.7658, 'grad_norm': 8.514174461364746, 'learning_rate': 1.7975211391919955e-06} {'loss': 0.7658, 'grad_norm': 8.514174461364746, 'learning_rate': 1.7975211391919955e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.9121, 'grad_norm': 3.8440351486206055, 'learning_rate': 1.7936224240282307e-06} [Rank 1] Trainer log: {'loss': 0.9121, 'grad_norm': 3.8440351486206055, 'learning_rate': 1.7936224240282307e-06}[Rank 0] Trainer log: {'loss': 0.9121, 'grad_norm': 3.8440351486206055, 'learning_rate': 1.7936224240282307e-06} [Rank 2] Trainer log: {'loss': 0.9121, 'grad_norm': 3.8440351486206055, 'learning_rate': 1.7936224240282307e-06} {'loss': 0.9121, 'grad_norm': 3.8440351486206055, 'learning_rate': 1.7936224240282307e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.8022, 'grad_norm': 3.8285133838653564, 'learning_rate': 1.78972752492099e-06}[Rank 0] Trainer log: {'loss': 0.8022, 'grad_norm': 3.8285133838653564, 'learning_rate': 1.78972752492099e-06} [Rank 1] Trainer log: {'loss': 0.8022, 'grad_norm': 3.8285133838653564, 'learning_rate': 1.78972752492099e-06} [Rank 2] Trainer log: {'loss': 0.8022, 'grad_norm': 3.8285133838653564, 'learning_rate': 1.78972752492099e-06} {'loss': 0.8022, 'grad_norm': 3.8285133838653564, 'learning_rate': 1.78972752492099e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.9567, 'grad_norm': 5.794602870941162, 'learning_rate': 1.7858364436814446e-06} [Rank 2] Trainer log: {'loss': 0.9567, 'grad_norm': 5.794602870941162, 'learning_rate': 1.7858364436814446e-06}[Rank 0] Trainer log: {'loss': 0.9567, 'grad_norm': 5.794602870941162, 'learning_rate': 1.7858364436814446e-06} [Rank 1] Trainer log: {'loss': 0.9567, 'grad_norm': 5.794602870941162, 'learning_rate': 1.7858364436814446e-06} {'loss': 0.9567, 'grad_norm': 5.794602870941162, 'learning_rate': 1.7858364436814446e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 1.096, 'grad_norm': 5.813982009887695, 'learning_rate': 1.7819491821189939e-06} [Rank 0] Trainer log: {'loss': 1.096, 'grad_norm': 5.813982009887695, 'learning_rate': 1.7819491821189939e-06}[Rank 2] Trainer log: {'loss': 1.096, 'grad_norm': 5.813982009887695, 'learning_rate': 1.7819491821189939e-06} [Rank 1] Trainer log: {'loss': 1.096, 'grad_norm': 5.813982009887695, 'learning_rate': 1.7819491821189939e-06} {'loss': 1.096, 'grad_norm': 5.813982009887695, 'learning_rate': 1.7819491821189939e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.8112, 'grad_norm': 11.102357864379883, 'learning_rate': 1.7780657420412528e-06} [Rank 0] Trainer log: {'loss': 0.8112, 'grad_norm': 11.102357864379883, 'learning_rate': 1.7780657420412528e-06}[Rank 2] Trainer log: {'loss': 0.8112, 'grad_norm': 11.102357864379883, 'learning_rate': 1.7780657420412528e-06} [Rank 1] Trainer log: {'loss': 0.8112, 'grad_norm': 11.102357864379883, 'learning_rate': 1.7780657420412528e-06} {'loss': 0.8112, 'grad_norm': 11.102357864379883, 'learning_rate': 1.7780657420412528e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8087, 'grad_norm': 2.867265224456787, 'learning_rate': 1.7741861252540693e-06} [Rank 3] Trainer log: {'loss': 0.8087, 'grad_norm': 2.867265224456787, 'learning_rate': 1.7741861252540693e-06}[Rank 0] Trainer log: {'loss': 0.8087, 'grad_norm': 2.867265224456787, 'learning_rate': 1.7741861252540693e-06} [Rank 2] Trainer log: {'loss': 0.8087, 'grad_norm': 2.867265224456787, 'learning_rate': 1.7741861252540693e-06} {'loss': 0.8087, 'grad_norm': 2.867265224456787, 'learning_rate': 1.7741861252540693e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 0.9861, 'grad_norm': 7.046249866485596, 'learning_rate': 1.7703103335615023e-06}[Rank 3] Trainer log: {'loss': 0.9861, 'grad_norm': 7.046249866485596, 'learning_rate': 1.7703103335615023e-06}[Rank 1] Trainer log: {'loss': 0.9861, 'grad_norm': 7.046249866485596, 'learning_rate': 1.7703103335615023e-06} [Rank 0] Trainer log: {'loss': 0.9861, 'grad_norm': 7.046249866485596, 'learning_rate': 1.7703103335615023e-06} {'loss': 0.9861, 'grad_norm': 7.046249866485596, 'learning_rate': 1.7703103335615023e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.5889, 'grad_norm': 4.4706196784973145, 'learning_rate': 1.7664383687658438e-06}[Rank 3] Trainer log: {'loss': 0.5889, 'grad_norm': 4.4706196784973145, 'learning_rate': 1.7664383687658438e-06} [Rank 2] Trainer log: {'loss': 0.5889, 'grad_norm': 4.4706196784973145, 'learning_rate': 1.7664383687658438e-06} [Rank 0] Trainer log: {'loss': 0.5889, 'grad_norm': 4.4706196784973145, 'learning_rate': 1.7664383687658438e-06} {'loss': 0.5889, 'grad_norm': 4.4706196784973145, 'learning_rate': 1.7664383687658438e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.7106, 'grad_norm': 2.8935940265655518, 'learning_rate': 1.7625702326675952e-06} [Rank 0] Trainer log: {'loss': 0.7106, 'grad_norm': 2.8935940265655518, 'learning_rate': 1.7625702326675952e-06}[Rank 1] Trainer log: {'loss': 0.7106, 'grad_norm': 2.8935940265655518, 'learning_rate': 1.7625702326675952e-06}[Rank 2] Trainer log: {'loss': 0.7106, 'grad_norm': 2.8935940265655518, 'learning_rate': 1.7625702326675952e-06} {'loss': 0.7106, 'grad_norm': 2.8935940265655518, 'learning_rate': 1.7625702326675952e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.6931, 'grad_norm': 4.711644172668457, 'learning_rate': 1.758705927065487e-06}[Rank 3] Trainer log: {'loss': 0.6931, 'grad_norm': 4.711644172668457, 'learning_rate': 1.758705927065487e-06} [Rank 0] Trainer log: {'loss': 0.6931, 'grad_norm': 4.711644172668457, 'learning_rate': 1.758705927065487e-06} [Rank 2] Trainer log: {'loss': 0.6931, 'grad_norm': 4.711644172668457, 'learning_rate': 1.758705927065487e-06} {'loss': 0.6931, 'grad_norm': 4.711644172668457, 'learning_rate': 1.758705927065487e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.6492, 'grad_norm': 2.71553635597229, 'learning_rate': 1.754845453756463e-06}[Rank 0] Trainer log: {'loss': 0.6492, 'grad_norm': 2.71553635597229, 'learning_rate': 1.754845453756463e-06} [Rank 1] Trainer log: {'loss': 0.6492, 'grad_norm': 2.71553635597229, 'learning_rate': 1.754845453756463e-06} [Rank 2] Trainer log: {'loss': 0.6492, 'grad_norm': 2.71553635597229, 'learning_rate': 1.754845453756463e-06} {'loss': 0.6492, 'grad_norm': 2.71553635597229, 'learning_rate': 1.754845453756463e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.6483, 'grad_norm': 3.075631856918335, 'learning_rate': 1.7509888145356824e-06}[Rank 3] Trainer log: {'loss': 0.6483, 'grad_norm': 3.075631856918335, 'learning_rate': 1.7509888145356824e-06}[Rank 1] Trainer log: {'loss': 0.6483, 'grad_norm': 3.075631856918335, 'learning_rate': 1.7509888145356824e-06} [Rank 2] Trainer log: {'loss': 0.6483, 'grad_norm': 3.075631856918335, 'learning_rate': 1.7509888145356824e-06} {'loss': 0.6483, 'grad_norm': 3.075631856918335, 'learning_rate': 1.7509888145356824e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.7196, 'grad_norm': 4.072166442871094, 'learning_rate': 1.7471360111965308e-06}[Rank 3] Trainer log: {'loss': 0.7196, 'grad_norm': 4.072166442871094, 'learning_rate': 1.7471360111965308e-06} [Rank 1] Trainer log: {'loss': 0.7196, 'grad_norm': 4.072166442871094, 'learning_rate': 1.7471360111965308e-06}[Rank 2] Trainer log: {'loss': 0.7196, 'grad_norm': 4.072166442871094, 'learning_rate': 1.7471360111965308e-06} {'loss': 0.7196, 'grad_norm': 4.072166442871094, 'learning_rate': 1.7471360111965308e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.9749, 'grad_norm': 2.587965726852417, 'learning_rate': 1.7432870455305984e-06}[Rank 0] Trainer log: {'loss': 0.9749, 'grad_norm': 2.587965726852417, 'learning_rate': 1.7432870455305984e-06}[Rank 3] Trainer log: {'loss': 0.9749, 'grad_norm': 2.587965726852417, 'learning_rate': 1.7432870455305984e-06} [Rank 2] Trainer log: {'loss': 0.9749, 'grad_norm': 2.587965726852417, 'learning_rate': 1.7432870455305984e-06}{'loss': 0.9749, 'grad_norm': 2.587965726852417, 'learning_rate': 1.7432870455305984e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.7711, 'grad_norm': 7.7814040184021, 'learning_rate': 1.7394419193277013e-06}[Rank 3] Trainer log: {'loss': 0.7711, 'grad_norm': 7.7814040184021, 'learning_rate': 1.7394419193277013e-06} [Rank 2] Trainer log: {'loss': 0.7711, 'grad_norm': 7.7814040184021, 'learning_rate': 1.7394419193277013e-06} [Rank 1] Trainer log: {'loss': 0.7711, 'grad_norm': 7.7814040184021, 'learning_rate': 1.7394419193277013e-06} {'loss': 0.7711, 'grad_norm': 7.7814040184021, 'learning_rate': 1.7394419193277013e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8409, 'grad_norm': 1.9973758459091187, 'learning_rate': 1.7356006343758657e-06}[Rank 0] Trainer log: {'loss': 0.8409, 'grad_norm': 1.9973758459091187, 'learning_rate': 1.7356006343758657e-06}[Rank 3] Trainer log: {'loss': 0.8409, 'grad_norm': 1.9973758459091187, 'learning_rate': 1.7356006343758657e-06} [Rank 2] Trainer log: {'loss': 0.8409, 'grad_norm': 1.9973758459091187, 'learning_rate': 1.7356006343758657e-06} {'loss': 0.8409, 'grad_norm': 1.9973758459091187, 'learning_rate': 1.7356006343758657e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8421, 'grad_norm': 4.288811206817627, 'learning_rate': 1.731763192461332e-06}[Rank 3] Trainer log: {'loss': 0.8421, 'grad_norm': 4.288811206817627, 'learning_rate': 1.731763192461332e-06} [Rank 0] Trainer log: {'loss': 0.8421, 'grad_norm': 4.288811206817627, 'learning_rate': 1.731763192461332e-06}[Rank 2] Trainer log: {'loss': 0.8421, 'grad_norm': 4.288811206817627, 'learning_rate': 1.731763192461332e-06} {'loss': 0.8421, 'grad_norm': 4.288811206817627, 'learning_rate': 1.731763192461332e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.731, 'grad_norm': 2.4940667152404785, 'learning_rate': 1.7279295953685516e-06} [Rank 1] Trainer log: {'loss': 0.731, 'grad_norm': 2.4940667152404785, 'learning_rate': 1.7279295953685516e-06}[Rank 0] Trainer log: {'loss': 0.731, 'grad_norm': 2.4940667152404785, 'learning_rate': 1.7279295953685516e-06} [Rank 2] Trainer log: {'loss': 0.731, 'grad_norm': 2.4940667152404785, 'learning_rate': 1.7279295953685516e-06} {'loss': 0.731, 'grad_norm': 2.4940667152404785, 'learning_rate': 1.7279295953685516e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.924, 'grad_norm': 2.823662281036377, 'learning_rate': 1.724099844880187e-06}[Rank 1] Trainer log: {'loss': 0.924, 'grad_norm': 2.823662281036377, 'learning_rate': 1.724099844880187e-06}[Rank 3] Trainer log: {'loss': 0.924, 'grad_norm': 2.823662281036377, 'learning_rate': 1.724099844880187e-06} [Rank 2] Trainer log: {'loss': 0.924, 'grad_norm': 2.823662281036377, 'learning_rate': 1.724099844880187e-06} {'loss': 0.924, 'grad_norm': 2.823662281036377, 'learning_rate': 1.724099844880187e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.7105, 'grad_norm': 3.5363006591796875, 'learning_rate': 1.7202739427771197e-06}[Rank 0] Trainer log: {'loss': 0.7105, 'grad_norm': 3.5363006591796875, 'learning_rate': 1.7202739427771197e-06} [Rank 1] Trainer log: {'loss': 0.7105, 'grad_norm': 3.5363006591796875, 'learning_rate': 1.7202739427771197e-06} [Rank 2] Trainer log: {'loss': 0.7105, 'grad_norm': 3.5363006591796875, 'learning_rate': 1.7202739427771197e-06} {'loss': 0.7105, 'grad_norm': 3.5363006591796875, 'learning_rate': 1.7202739427771197e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.6108, 'grad_norm': 4.0703277587890625, 'learning_rate': 1.7164518908384353e-06}[Rank 1] Trainer log: {'loss': 0.6108, 'grad_norm': 4.0703277587890625, 'learning_rate': 1.7164518908384353e-06}[Rank 0] Trainer log: {'loss': 0.6108, 'grad_norm': 4.0703277587890625, 'learning_rate': 1.7164518908384353e-06} [Rank 2] Trainer log: {'loss': 0.6108, 'grad_norm': 4.0703277587890625, 'learning_rate': 1.7164518908384353e-06} {'loss': 0.6108, 'grad_norm': 4.0703277587890625, 'learning_rate': 1.7164518908384353e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.7555, 'grad_norm': 2.3395278453826904, 'learning_rate': 1.7126336908414266e-06}[Rank 3] Trainer log: {'loss': 0.7555, 'grad_norm': 2.3395278453826904, 'learning_rate': 1.7126336908414266e-06} [Rank 1] Trainer log: {'loss': 0.7555, 'grad_norm': 2.3395278453826904, 'learning_rate': 1.7126336908414266e-06} [Rank 2] Trainer log: {'loss': 0.7555, 'grad_norm': 2.3395278453826904, 'learning_rate': 1.7126336908414266e-06} {'loss': 0.7555, 'grad_norm': 2.3395278453826904, 'learning_rate': 1.7126336908414266e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.6404, 'grad_norm': 5.443278789520264, 'learning_rate': 1.7088193445616018e-06} [Rank 2] Trainer log: {'loss': 0.6404, 'grad_norm': 5.443278789520264, 'learning_rate': 1.7088193445616018e-06}[Rank 0] Trainer log: {'loss': 0.6404, 'grad_norm': 5.443278789520264, 'learning_rate': 1.7088193445616018e-06} [Rank 1] Trainer log: {'loss': 0.6404, 'grad_norm': 5.443278789520264, 'learning_rate': 1.7088193445616018e-06} {'loss': 0.6404, 'grad_norm': 5.443278789520264, 'learning_rate': 1.7088193445616018e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.7121, 'grad_norm': 3.7148051261901855, 'learning_rate': 1.7050088537726762e-06}[Rank 3] Trainer log: {'loss': 0.7121, 'grad_norm': 3.7148051261901855, 'learning_rate': 1.7050088537726762e-06}[Rank 2] Trainer log: {'loss': 0.7121, 'grad_norm': 3.7148051261901855, 'learning_rate': 1.7050088537726762e-06} [Rank 0] Trainer log: {'loss': 0.7121, 'grad_norm': 3.7148051261901855, 'learning_rate': 1.7050088537726762e-06} {'loss': 0.7121, 'grad_norm': 3.7148051261901855, 'learning_rate': 1.7050088537726762e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.7806, 'grad_norm': 4.183579921722412, 'learning_rate': 1.7012022202465661e-06}[Rank 1] Trainer log: {'loss': 0.7806, 'grad_norm': 4.183579921722412, 'learning_rate': 1.7012022202465661e-06}[Rank 0] Trainer log: {'loss': 0.7806, 'grad_norm': 4.183579921722412, 'learning_rate': 1.7012022202465661e-06} [Rank 2] Trainer log: {'loss': 0.7806, 'grad_norm': 4.183579921722412, 'learning_rate': 1.7012022202465661e-06} {'loss': 0.7806, 'grad_norm': 4.183579921722412, 'learning_rate': 1.7012022202465661e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8135, 'grad_norm': 4.410374164581299, 'learning_rate': 1.6973994457534026e-06}[Rank 3] Trainer log: {'loss': 0.8135, 'grad_norm': 4.410374164581299, 'learning_rate': 1.6973994457534026e-06} [Rank 2] Trainer log: {'loss': 0.8135, 'grad_norm': 4.410374164581299, 'learning_rate': 1.6973994457534026e-06} [Rank 0] Trainer log: {'loss': 0.8135, 'grad_norm': 4.410374164581299, 'learning_rate': 1.6973994457534026e-06} {'loss': 0.8135, 'grad_norm': 4.410374164581299, 'learning_rate': 1.6973994457534026e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.6714, 'grad_norm': 2.3650946617126465, 'learning_rate': 1.6936005320615156e-06}[Rank 3] Trainer log: {'loss': 0.6714, 'grad_norm': 2.3650946617126465, 'learning_rate': 1.6936005320615156e-06}[Rank 0] Trainer log: {'loss': 0.6714, 'grad_norm': 2.3650946617126465, 'learning_rate': 1.6936005320615156e-06} [Rank 2] Trainer log: {'loss': 0.6714, 'grad_norm': 2.3650946617126465, 'learning_rate': 1.6936005320615156e-06} {'loss': 0.6714, 'grad_norm': 2.3650946617126465, 'learning_rate': 1.6936005320615156e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.8956, 'grad_norm': 6.014004230499268, 'learning_rate': 1.6898054809374398e-06}[Rank 1] Trainer log: {'loss': 0.8956, 'grad_norm': 6.014004230499268, 'learning_rate': 1.6898054809374398e-06}[Rank 2] Trainer log: {'loss': 0.8956, 'grad_norm': 6.014004230499268, 'learning_rate': 1.6898054809374398e-06} [Rank 3] Trainer log: {'loss': 0.8956, 'grad_norm': 6.014004230499268, 'learning_rate': 1.6898054809374398e-06} {'loss': 0.8956, 'grad_norm': 6.014004230499268, 'learning_rate': 1.6898054809374398e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 0.7769, 'grad_norm': 1.9560585021972656, 'learning_rate': 1.6860142941459211e-06}[Rank 3] Trainer log: {'loss': 0.7769, 'grad_norm': 1.9560585021972656, 'learning_rate': 1.6860142941459211e-06} [Rank 1] Trainer log: {'loss': 0.7769, 'grad_norm': 1.9560585021972656, 'learning_rate': 1.6860142941459211e-06} [Rank 2] Trainer log: {'loss': 0.7769, 'grad_norm': 1.9560585021972656, 'learning_rate': 1.6860142941459211e-06} {'loss': 0.7769, 'grad_norm': 1.9560585021972656, 'learning_rate': 1.6860142941459211e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 0.8741, 'grad_norm': 2.519016981124878, 'learning_rate': 1.6822269734499008e-06} [Rank 0] Trainer log: {'loss': 0.8741, 'grad_norm': 2.519016981124878, 'learning_rate': 1.6822269734499008e-06}[Rank 1] Trainer log: {'loss': 0.8741, 'grad_norm': 2.519016981124878, 'learning_rate': 1.6822269734499008e-06} [Rank 3] Trainer log: {'loss': 0.8741, 'grad_norm': 2.519016981124878, 'learning_rate': 1.6822269734499008e-06} {'loss': 0.8741, 'grad_norm': 2.519016981124878, 'learning_rate': 1.6822269734499008e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.8379, 'grad_norm': 4.699592590332031, 'learning_rate': 1.6784435206105231e-06} [Rank 2] Trainer log: {'loss': 0.8379, 'grad_norm': 4.699592590332031, 'learning_rate': 1.6784435206105231e-06} [Rank 1] Trainer log: {'loss': 0.8379, 'grad_norm': 4.699592590332031, 'learning_rate': 1.6784435206105231e-06} [Rank 0] Trainer log: {'loss': 0.8379, 'grad_norm': 4.699592590332031, 'learning_rate': 1.6784435206105231e-06} {'loss': 0.8379, 'grad_norm': 4.699592590332031, 'learning_rate': 1.6784435206105231e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.9194, 'grad_norm': 1.9539737701416016, 'learning_rate': 1.6746639373871366e-06} [Rank 1] Trainer log: {'loss': 0.9194, 'grad_norm': 1.9539737701416016, 'learning_rate': 1.6746639373871366e-06} [Rank 0] Trainer log: {'loss': 0.9194, 'grad_norm': 1.9539737701416016, 'learning_rate': 1.6746639373871366e-06}[Rank 2] Trainer log: {'loss': 0.9194, 'grad_norm': 1.9539737701416016, 'learning_rate': 1.6746639373871366e-06} {'loss': 0.9194, 'grad_norm': 1.9539737701416016, 'learning_rate': 1.6746639373871366e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.7816, 'grad_norm': 2.38702654838562, 'learning_rate': 1.6708882255372926e-06}[Rank 2] Trainer log: {'loss': 0.7816, 'grad_norm': 2.38702654838562, 'learning_rate': 1.6708882255372926e-06} [Rank 1] Trainer log: {'loss': 0.7816, 'grad_norm': 2.38702654838562, 'learning_rate': 1.6708882255372926e-06} [Rank 0] Trainer log: {'loss': 0.7816, 'grad_norm': 2.38702654838562, 'learning_rate': 1.6708882255372926e-06} {'loss': 0.7816, 'grad_norm': 2.38702654838562, 'learning_rate': 1.6708882255372926e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 0.9624, 'grad_norm': 5.815980434417725, 'learning_rate': 1.667116386816734e-06} [Rank 1] Trainer log: {'loss': 0.9624, 'grad_norm': 5.815980434417725, 'learning_rate': 1.667116386816734e-06}[Rank 0] Trainer log: {'loss': 0.9624, 'grad_norm': 5.815980434417725, 'learning_rate': 1.667116386816734e-06} [Rank 3] Trainer log: {'loss': 0.9624, 'grad_norm': 5.815980434417725, 'learning_rate': 1.667116386816734e-06} {'loss': 0.9624, 'grad_norm': 5.815980434417725, 'learning_rate': 1.667116386816734e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8371, 'grad_norm': 9.010271072387695, 'learning_rate': 1.6633484229794116e-06} [Rank 0] Trainer log: {'loss': 0.8371, 'grad_norm': 9.010271072387695, 'learning_rate': 1.6633484229794116e-06}[Rank 3] Trainer log: {'loss': 0.8371, 'grad_norm': 9.010271072387695, 'learning_rate': 1.6633484229794116e-06} [Rank 2] Trainer log: {'loss': 0.8371, 'grad_norm': 9.010271072387695, 'learning_rate': 1.6633484229794116e-06} {'loss': 0.8371, 'grad_norm': 9.010271072387695, 'learning_rate': 1.6633484229794116e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.697, 'grad_norm': 1.8555835485458374, 'learning_rate': 1.65958433577747e-06}[Rank 0] Trainer log: {'loss': 0.697, 'grad_norm': 1.8555835485458374, 'learning_rate': 1.65958433577747e-06} [Rank 1] Trainer log: {'loss': 0.697, 'grad_norm': 1.8555835485458374, 'learning_rate': 1.65958433577747e-06} [Rank 2] Trainer log: {'loss': 0.697, 'grad_norm': 1.8555835485458374, 'learning_rate': 1.65958433577747e-06} {'loss': 0.697, 'grad_norm': 1.8555835485458374, 'learning_rate': 1.65958433577747e-06, 'epoch': 0.82} [Rank 0] Trainer log: {'loss': 1.0199, 'grad_norm': 3.7452571392059326, 'learning_rate': 1.6558241269612485e-06}[Rank 3] Trainer log: {'loss': 1.0199, 'grad_norm': 3.7452571392059326, 'learning_rate': 1.6558241269612485e-06}[Rank 1] Trainer log: {'loss': 1.0199, 'grad_norm': 3.7452571392059326, 'learning_rate': 1.6558241269612485e-06} [Rank 2] Trainer log: {'loss': 1.0199, 'grad_norm': 3.7452571392059326, 'learning_rate': 1.6558241269612485e-06} {'loss': 1.0199, 'grad_norm': 3.7452571392059326, 'learning_rate': 1.6558241269612485e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 0.7633, 'grad_norm': 3.5342190265655518, 'learning_rate': 1.6520677982792887e-06}[Rank 3] Trainer log: {'loss': 0.7633, 'grad_norm': 3.5342190265655518, 'learning_rate': 1.6520677982792887e-06}[Rank 1] Trainer log: {'loss': 0.7633, 'grad_norm': 3.5342190265655518, 'learning_rate': 1.6520677982792887e-06} [Rank 0] Trainer log: {'loss': 0.7633, 'grad_norm': 3.5342190265655518, 'learning_rate': 1.6520677982792887e-06} {'loss': 0.7633, 'grad_norm': 3.5342190265655518, 'learning_rate': 1.6520677982792887e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.6607, 'grad_norm': 11.837660789489746, 'learning_rate': 1.6483153514783246e-06}[Rank 0] Trainer log: {'loss': 0.6607, 'grad_norm': 11.837660789489746, 'learning_rate': 1.6483153514783246e-06}[Rank 2] Trainer log: {'loss': 0.6607, 'grad_norm': 11.837660789489746, 'learning_rate': 1.6483153514783246e-06} [Rank 1] Trainer log: {'loss': 0.6607, 'grad_norm': 11.837660789489746, 'learning_rate': 1.6483153514783246e-06} {'loss': 0.6607, 'grad_norm': 11.837660789489746, 'learning_rate': 1.6483153514783246e-06, 'epoch': 0.82} [Rank 3] Trainer log: {'loss': 0.8145, 'grad_norm': 3.518101930618286, 'learning_rate': 1.6445667883032824e-06} [Rank 2] Trainer log: {'loss': 0.8145, 'grad_norm': 3.518101930618286, 'learning_rate': 1.6445667883032824e-06} [Rank 1] Trainer log: {'loss': 0.8145, 'grad_norm': 3.518101930618286, 'learning_rate': 1.6445667883032824e-06} [Rank 0] Trainer log: {'loss': 0.8145, 'grad_norm': 3.518101930618286, 'learning_rate': 1.6445667883032824e-06} {'loss': 0.8145, 'grad_norm': 3.518101930618286, 'learning_rate': 1.6445667883032824e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.7457, 'grad_norm': 4.4516282081604, 'learning_rate': 1.6408221104972888e-06}[Rank 3] Trainer log: {'loss': 0.7457, 'grad_norm': 4.4516282081604, 'learning_rate': 1.6408221104972888e-06} [Rank 0] Trainer log: {'loss': 0.7457, 'grad_norm': 4.4516282081604, 'learning_rate': 1.6408221104972888e-06}[Rank 2] Trainer log: {'loss': 0.7457, 'grad_norm': 4.4516282081604, 'learning_rate': 1.6408221104972888e-06} {'loss': 0.7457, 'grad_norm': 4.4516282081604, 'learning_rate': 1.6408221104972888e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.8393, 'grad_norm': 5.241701602935791, 'learning_rate': 1.637081319801661e-06}[Rank 0] Trainer log: {'loss': 0.8393, 'grad_norm': 5.241701602935791, 'learning_rate': 1.637081319801661e-06}[Rank 3] Trainer log: {'loss': 0.8393, 'grad_norm': 5.241701602935791, 'learning_rate': 1.637081319801661e-06} [Rank 2] Trainer log: {'loss': 0.8393, 'grad_norm': 5.241701602935791, 'learning_rate': 1.637081319801661e-06} {'loss': 0.8393, 'grad_norm': 5.241701602935791, 'learning_rate': 1.637081319801661e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 0.9841, 'grad_norm': 2.8268332481384277, 'learning_rate': 1.6333444179559078e-06} [Rank 3] Trainer log: {'loss': 0.9841, 'grad_norm': 2.8268332481384277, 'learning_rate': 1.6333444179559078e-06} [Rank 0] Trainer log: {'loss': 0.9841, 'grad_norm': 2.8268332481384277, 'learning_rate': 1.6333444179559078e-06} [Rank 1] Trainer log: {'loss': 0.9841, 'grad_norm': 2.8268332481384277, 'learning_rate': 1.6333444179559078e-06} {'loss': 0.9841, 'grad_norm': 2.8268332481384277, 'learning_rate': 1.6333444179559078e-06, 'epoch': 0.82} [Rank 1] Trainer log: {'loss': 0.7182, 'grad_norm': 2.9055163860321045, 'learning_rate': 1.6296114066977264e-06}[Rank 3] Trainer log: {'loss': 0.7182, 'grad_norm': 2.9055163860321045, 'learning_rate': 1.6296114066977264e-06} [Rank 0] Trainer log: {'loss': 0.7182, 'grad_norm': 2.9055163860321045, 'learning_rate': 1.6296114066977264e-06}[Rank 2] Trainer log: {'loss': 0.7182, 'grad_norm': 2.9055163860321045, 'learning_rate': 1.6296114066977264e-06} {'loss': 0.7182, 'grad_norm': 2.9055163860321045, 'learning_rate': 1.6296114066977264e-06, 'epoch': 0.82} [Rank 2] Trainer log: {'loss': 1.0208, 'grad_norm': 2.784768581390381, 'learning_rate': 1.6258822877630142e-06}[Rank 3] Trainer log: {'loss': 1.0208, 'grad_norm': 2.784768581390381, 'learning_rate': 1.6258822877630142e-06}[Rank 1] Trainer log: {'loss': 1.0208, 'grad_norm': 2.784768581390381, 'learning_rate': 1.6258822877630142e-06} [Rank 0] Trainer log: {'loss': 1.0208, 'grad_norm': 2.784768581390381, 'learning_rate': 1.6258822877630142e-06} {'loss': 1.0208, 'grad_norm': 2.784768581390381, 'learning_rate': 1.6258822877630142e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.9333, 'grad_norm': 2.4634182453155518, 'learning_rate': 1.6221570628858484e-06}[Rank 3] Trainer log: {'loss': 0.9333, 'grad_norm': 2.4634182453155518, 'learning_rate': 1.6221570628858484e-06} [Rank 2] Trainer log: {'loss': 0.9333, 'grad_norm': 2.4634182453155518, 'learning_rate': 1.6221570628858484e-06}[Rank 1] Trainer log: {'loss': 0.9333, 'grad_norm': 2.4634182453155518, 'learning_rate': 1.6221570628858484e-06} {'loss': 0.9333, 'grad_norm': 2.4634182453155518, 'learning_rate': 1.6221570628858484e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7751, 'grad_norm': 3.0164854526519775, 'learning_rate': 1.6184357337985002e-06} [Rank 1] Trainer log: {'loss': 0.7751, 'grad_norm': 3.0164854526519775, 'learning_rate': 1.6184357337985002e-06}[Rank 0] Trainer log: {'loss': 0.7751, 'grad_norm': 3.0164854526519775, 'learning_rate': 1.6184357337985002e-06} [Rank 2] Trainer log: {'loss': 0.7751, 'grad_norm': 3.0164854526519775, 'learning_rate': 1.6184357337985002e-06} {'loss': 0.7751, 'grad_norm': 3.0164854526519775, 'learning_rate': 1.6184357337985002e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.6397, 'grad_norm': 12.634881019592285, 'learning_rate': 1.6147183022314317e-06}[Rank 1] Trainer log: {'loss': 0.6397, 'grad_norm': 12.634881019592285, 'learning_rate': 1.6147183022314317e-06} [Rank 2] Trainer log: {'loss': 0.6397, 'grad_norm': 12.634881019592285, 'learning_rate': 1.6147183022314317e-06} [Rank 0] Trainer log: {'loss': 0.6397, 'grad_norm': 12.634881019592285, 'learning_rate': 1.6147183022314317e-06} {'loss': 0.6397, 'grad_norm': 12.634881019592285, 'learning_rate': 1.6147183022314317e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.9304, 'grad_norm': 4.062161445617676, 'learning_rate': 1.6110047699132846e-06}[Rank 3] Trainer log: {'loss': 0.9304, 'grad_norm': 4.062161445617676, 'learning_rate': 1.6110047699132846e-06}[Rank 1] Trainer log: {'loss': 0.9304, 'grad_norm': 4.062161445617676, 'learning_rate': 1.6110047699132846e-06} [Rank 2] Trainer log: {'loss': 0.9304, 'grad_norm': 4.062161445617676, 'learning_rate': 1.6110047699132846e-06} {'loss': 0.9304, 'grad_norm': 4.062161445617676, 'learning_rate': 1.6110047699132846e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7028, 'grad_norm': 8.929254531860352, 'learning_rate': 1.6072951385708968e-06} [Rank 0] Trainer log: {'loss': 0.7028, 'grad_norm': 8.929254531860352, 'learning_rate': 1.6072951385708968e-06}[Rank 1] Trainer log: {'loss': 0.7028, 'grad_norm': 8.929254531860352, 'learning_rate': 1.6072951385708968e-06} [Rank 2] Trainer log: {'loss': 0.7028, 'grad_norm': 8.929254531860352, 'learning_rate': 1.6072951385708968e-06} {'loss': 0.7028, 'grad_norm': 8.929254531860352, 'learning_rate': 1.6072951385708968e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.7947, 'grad_norm': 4.783249378204346, 'learning_rate': 1.6035894099292904e-06} [Rank 0] Trainer log: {'loss': 0.7947, 'grad_norm': 4.783249378204346, 'learning_rate': 1.6035894099292904e-06}[Rank 3] Trainer log: {'loss': 0.7947, 'grad_norm': 4.783249378204346, 'learning_rate': 1.6035894099292904e-06} [Rank 2] Trainer log: {'loss': 0.7947, 'grad_norm': 4.783249378204346, 'learning_rate': 1.6035894099292904e-06} {'loss': 0.7947, 'grad_norm': 4.783249378204346, 'learning_rate': 1.6035894099292904e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.858, 'grad_norm': 6.115985870361328, 'learning_rate': 1.5998875857116657e-06}[Rank 1] Trainer log: {'loss': 0.858, 'grad_norm': 6.115985870361328, 'learning_rate': 1.5998875857116657e-06}[Rank 3] Trainer log: {'loss': 0.858, 'grad_norm': 6.115985870361328, 'learning_rate': 1.5998875857116657e-06} [Rank 2] Trainer log: {'loss': 0.858, 'grad_norm': 6.115985870361328, 'learning_rate': 1.5998875857116657e-06} {'loss': 0.858, 'grad_norm': 6.115985870361328, 'learning_rate': 1.5998875857116657e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.667, 'grad_norm': 15.510725975036621, 'learning_rate': 1.5961896676394118e-06}[Rank 3] Trainer log: {'loss': 0.667, 'grad_norm': 15.510725975036621, 'learning_rate': 1.5961896676394118e-06} [Rank 1] Trainer log: {'loss': 0.667, 'grad_norm': 15.510725975036621, 'learning_rate': 1.5961896676394118e-06} {'loss': 0.667, 'grad_norm': 15.510725975036621, 'learning_rate': 1.5961896676394118e-06, 'epoch': 0.83}[Rank 2] Trainer log: {'loss': 0.667, 'grad_norm': 15.510725975036621, 'learning_rate': 1.5961896676394118e-06} [Rank 3] Trainer log: {'loss': 0.5477, 'grad_norm': 18.955764770507812, 'learning_rate': 1.592495657432105e-06} [Rank 2] Trainer log: {'loss': 0.5477, 'grad_norm': 18.955764770507812, 'learning_rate': 1.592495657432105e-06} [Rank 0] Trainer log: {'loss': 0.5477, 'grad_norm': 18.955764770507812, 'learning_rate': 1.592495657432105e-06}[Rank 1] Trainer log: {'loss': 0.5477, 'grad_norm': 18.955764770507812, 'learning_rate': 1.592495657432105e-06} {'loss': 0.5477, 'grad_norm': 18.955764770507812, 'learning_rate': 1.592495657432105e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.9922, 'grad_norm': 2.7430942058563232, 'learning_rate': 1.588805556807499e-06}[Rank 2] Trainer log: {'loss': 0.9922, 'grad_norm': 2.7430942058563232, 'learning_rate': 1.588805556807499e-06}[Rank 0] Trainer log: {'loss': 0.9922, 'grad_norm': 2.7430942058563232, 'learning_rate': 1.588805556807499e-06} [Rank 3] Trainer log: {'loss': 0.9922, 'grad_norm': 2.7430942058563232, 'learning_rate': 1.588805556807499e-06} {'loss': 0.9922, 'grad_norm': 2.7430942058563232, 'learning_rate': 1.588805556807499e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.751, 'grad_norm': 5.514708518981934, 'learning_rate': 1.585119367481529e-06}[Rank 3] Trainer log: {'loss': 0.751, 'grad_norm': 5.514708518981934, 'learning_rate': 1.585119367481529e-06}[Rank 1] Trainer log: {'loss': 0.751, 'grad_norm': 5.514708518981934, 'learning_rate': 1.585119367481529e-06} [Rank 2] Trainer log: {'loss': 0.751, 'grad_norm': 5.514708518981934, 'learning_rate': 1.585119367481529e-06} {'loss': 0.751, 'grad_norm': 5.514708518981934, 'learning_rate': 1.585119367481529e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.676, 'grad_norm': 2.893368721008301, 'learning_rate': 1.5814370911683196e-06} [Rank 0] Trainer log: {'loss': 0.676, 'grad_norm': 2.893368721008301, 'learning_rate': 1.5814370911683196e-06}[Rank 3] Trainer log: {'loss': 0.676, 'grad_norm': 2.893368721008301, 'learning_rate': 1.5814370911683196e-06} [Rank 2] Trainer log: {'loss': 0.676, 'grad_norm': 2.893368721008301, 'learning_rate': 1.5814370911683196e-06} {'loss': 0.676, 'grad_norm': 2.893368721008301, 'learning_rate': 1.5814370911683196e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7099, 'grad_norm': 5.251486778259277, 'learning_rate': 1.5777587295801632e-06}[Rank 2] Trainer log: {'loss': 0.7099, 'grad_norm': 5.251486778259277, 'learning_rate': 1.5777587295801632e-06}[Rank 1] Trainer log: {'loss': 0.7099, 'grad_norm': 5.251486778259277, 'learning_rate': 1.5777587295801632e-06} [Rank 0] Trainer log: {'loss': 0.7099, 'grad_norm': 5.251486778259277, 'learning_rate': 1.5777587295801632e-06} {'loss': 0.7099, 'grad_norm': 5.251486778259277, 'learning_rate': 1.5777587295801632e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.8791, 'grad_norm': 5.984777450561523, 'learning_rate': 1.574084284427544e-06}[Rank 2] Trainer log: {'loss': 0.8791, 'grad_norm': 5.984777450561523, 'learning_rate': 1.574084284427544e-06} [Rank 0] Trainer log: {'loss': 0.8791, 'grad_norm': 5.984777450561523, 'learning_rate': 1.574084284427544e-06}[Rank 3] Trainer log: {'loss': 0.8791, 'grad_norm': 5.984777450561523, 'learning_rate': 1.574084284427544e-06} {'loss': 0.8791, 'grad_norm': 5.984777450561523, 'learning_rate': 1.574084284427544e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7301, 'grad_norm': 8.106928825378418, 'learning_rate': 1.5704137574191202e-06}[Rank 2] Trainer log: {'loss': 0.7301, 'grad_norm': 8.106928825378418, 'learning_rate': 1.5704137574191202e-06}[Rank 0] Trainer log: {'loss': 0.7301, 'grad_norm': 8.106928825378418, 'learning_rate': 1.5704137574191202e-06} [Rank 1] Trainer log: {'loss': 0.7301, 'grad_norm': 8.106928825378418, 'learning_rate': 1.5704137574191202e-06} {'loss': 0.7301, 'grad_norm': 8.106928825378418, 'learning_rate': 1.5704137574191202e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.6183, 'grad_norm': 8.940073013305664, 'learning_rate': 1.5667471502617248e-06}[Rank 3] Trainer log: {'loss': 0.6183, 'grad_norm': 8.940073013305664, 'learning_rate': 1.5667471502617248e-06}[Rank 1] Trainer log: {'loss': 0.6183, 'grad_norm': 8.940073013305664, 'learning_rate': 1.5667471502617248e-06} [Rank 2] Trainer log: {'loss': 0.6183, 'grad_norm': 8.940073013305664, 'learning_rate': 1.5667471502617248e-06} {'loss': 0.6183, 'grad_norm': 8.940073013305664, 'learning_rate': 1.5667471502617248e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7418, 'grad_norm': 3.613008499145508, 'learning_rate': 1.5630844646603694e-06}[Rank 0] Trainer log: {'loss': 0.7418, 'grad_norm': 3.613008499145508, 'learning_rate': 1.5630844646603694e-06} [Rank 1] Trainer log: {'loss': 0.7418, 'grad_norm': 3.613008499145508, 'learning_rate': 1.5630844646603694e-06} [Rank 2] Trainer log: {'loss': 0.7418, 'grad_norm': 3.613008499145508, 'learning_rate': 1.5630844646603694e-06} {'loss': 0.7418, 'grad_norm': 3.613008499145508, 'learning_rate': 1.5630844646603694e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.9507, 'grad_norm': 5.7991132736206055, 'learning_rate': 1.559425702318248e-06} [Rank 3] Trainer log: {'loss': 0.9507, 'grad_norm': 5.7991132736206055, 'learning_rate': 1.559425702318248e-06}[Rank 2] Trainer log: {'loss': 0.9507, 'grad_norm': 5.7991132736206055, 'learning_rate': 1.559425702318248e-06} [Rank 0] Trainer log: {'loss': 0.9507, 'grad_norm': 5.7991132736206055, 'learning_rate': 1.559425702318248e-06} {'loss': 0.9507, 'grad_norm': 5.7991132736206055, 'learning_rate': 1.559425702318248e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.4868, 'grad_norm': 12.478348731994629, 'learning_rate': 1.5557708649367231e-06} [Rank 3] Trainer log: {'loss': 0.4868, 'grad_norm': 12.478348731994629, 'learning_rate': 1.5557708649367231e-06} [Rank 0] Trainer log: {'loss': 0.4868, 'grad_norm': 12.478348731994629, 'learning_rate': 1.5557708649367231e-06} [Rank 2] Trainer log: {'loss': 0.4868, 'grad_norm': 12.478348731994629, 'learning_rate': 1.5557708649367231e-06} {'loss': 0.4868, 'grad_norm': 12.478348731994629, 'learning_rate': 1.5557708649367231e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.6882, 'grad_norm': 5.4627885818481445, 'learning_rate': 1.552119954215332e-06}[Rank 3] Trainer log: {'loss': 0.6882, 'grad_norm': 5.4627885818481445, 'learning_rate': 1.552119954215332e-06}[Rank 1] Trainer log: {'loss': 0.6882, 'grad_norm': 5.4627885818481445, 'learning_rate': 1.552119954215332e-06} [Rank 2] Trainer log: {'loss': 0.6882, 'grad_norm': 5.4627885818481445, 'learning_rate': 1.552119954215332e-06} {'loss': 0.6882, 'grad_norm': 5.4627885818481445, 'learning_rate': 1.552119954215332e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 1.0248, 'grad_norm': 1.8125240802764893, 'learning_rate': 1.5484729718517943e-06}[Rank 3] Trainer log: {'loss': 1.0248, 'grad_norm': 1.8125240802764893, 'learning_rate': 1.5484729718517943e-06} [Rank 2] Trainer log: {'loss': 1.0248, 'grad_norm': 1.8125240802764893, 'learning_rate': 1.5484729718517943e-06} [Rank 0] Trainer log: {'loss': 1.0248, 'grad_norm': 1.8125240802764893, 'learning_rate': 1.5484729718517943e-06} {'loss': 1.0248, 'grad_norm': 1.8125240802764893, 'learning_rate': 1.5484729718517943e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.6057, 'grad_norm': 8.601547241210938, 'learning_rate': 1.5448299195419914e-06}[Rank 2] Trainer log: {'loss': 0.6057, 'grad_norm': 8.601547241210938, 'learning_rate': 1.5448299195419914e-06}[Rank 0] Trainer log: {'loss': 0.6057, 'grad_norm': 8.601547241210938, 'learning_rate': 1.5448299195419914e-06} [Rank 1] Trainer log: {'loss': 0.6057, 'grad_norm': 8.601547241210938, 'learning_rate': 1.5448299195419914e-06} {'loss': 0.6057, 'grad_norm': 8.601547241210938, 'learning_rate': 1.5448299195419914e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.9624, 'grad_norm': 5.2669758796691895, 'learning_rate': 1.5411907989799856e-06}[Rank 0] Trainer log: {'loss': 0.9624, 'grad_norm': 5.2669758796691895, 'learning_rate': 1.5411907989799856e-06} [Rank 1] Trainer log: {'loss': 0.9624, 'grad_norm': 5.2669758796691895, 'learning_rate': 1.5411907989799856e-06} [Rank 2] Trainer log: {'loss': 0.9624, 'grad_norm': 5.2669758796691895, 'learning_rate': 1.5411907989799856e-06} {'loss': 0.9624, 'grad_norm': 5.2669758796691895, 'learning_rate': 1.5411907989799856e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.9102, 'grad_norm': 7.274292945861816, 'learning_rate': 1.5375556118580115e-06} [Rank 2] Trainer log: {'loss': 0.9102, 'grad_norm': 7.274292945861816, 'learning_rate': 1.5375556118580115e-06} [Rank 0] Trainer log: {'loss': 0.9102, 'grad_norm': 7.274292945861816, 'learning_rate': 1.5375556118580115e-06}[Rank 3] Trainer log: {'loss': 0.9102, 'grad_norm': 7.274292945861816, 'learning_rate': 1.5375556118580115e-06} {'loss': 0.9102, 'grad_norm': 7.274292945861816, 'learning_rate': 1.5375556118580115e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.746, 'grad_norm': 4.451104164123535, 'learning_rate': 1.5339243598664677e-06}[Rank 3] Trainer log: {'loss': 0.746, 'grad_norm': 4.451104164123535, 'learning_rate': 1.5339243598664677e-06}[Rank 0] Trainer log: {'loss': 0.746, 'grad_norm': 4.451104164123535, 'learning_rate': 1.5339243598664677e-06} [Rank 1] Trainer log: {'loss': 0.746, 'grad_norm': 4.451104164123535, 'learning_rate': 1.5339243598664677e-06} {'loss': 0.746, 'grad_norm': 4.451104164123535, 'learning_rate': 1.5339243598664677e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.7945, 'grad_norm': 3.695436954498291, 'learning_rate': 1.530297044693928e-06}[Rank 0] Trainer log: {'loss': 0.7945, 'grad_norm': 3.695436954498291, 'learning_rate': 1.530297044693928e-06} [Rank 1] Trainer log: {'loss': 0.7945, 'grad_norm': 3.695436954498291, 'learning_rate': 1.530297044693928e-06} [Rank 3] Trainer log: {'loss': 0.7945, 'grad_norm': 3.695436954498291, 'learning_rate': 1.530297044693928e-06} {'loss': 0.7945, 'grad_norm': 3.695436954498291, 'learning_rate': 1.530297044693928e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7103, 'grad_norm': 2.274041175842285, 'learning_rate': 1.52667366802713e-06}[Rank 1] Trainer log: {'loss': 0.7103, 'grad_norm': 2.274041175842285, 'learning_rate': 1.52667366802713e-06} [Rank 0] Trainer log: {'loss': 0.7103, 'grad_norm': 2.274041175842285, 'learning_rate': 1.52667366802713e-06} [Rank 2] Trainer log: {'loss': 0.7103, 'grad_norm': 2.274041175842285, 'learning_rate': 1.52667366802713e-06} {'loss': 0.7103, 'grad_norm': 2.274041175842285, 'learning_rate': 1.52667366802713e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.9308, 'grad_norm': 2.6815853118896484, 'learning_rate': 1.523054231550991e-06}[Rank 3] Trainer log: {'loss': 0.9308, 'grad_norm': 2.6815853118896484, 'learning_rate': 1.523054231550991e-06} [Rank 0] Trainer log: {'loss': 0.9308, 'grad_norm': 2.6815853118896484, 'learning_rate': 1.523054231550991e-06} [Rank 2] Trainer log: {'loss': 0.9308, 'grad_norm': 2.6815853118896484, 'learning_rate': 1.523054231550991e-06} {'loss': 0.9308, 'grad_norm': 2.6815853118896484, 'learning_rate': 1.523054231550991e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 1.0431, 'grad_norm': 3.711669445037842, 'learning_rate': 1.519438736948583e-06}[Rank 3] Trainer log: {'loss': 1.0431, 'grad_norm': 3.711669445037842, 'learning_rate': 1.519438736948583e-06}[Rank 0] Trainer log: {'loss': 1.0431, 'grad_norm': 3.711669445037842, 'learning_rate': 1.519438736948583e-06} [Rank 2] Trainer log: {'loss': 1.0431, 'grad_norm': 3.711669445037842, 'learning_rate': 1.519438736948583e-06} {'loss': 1.0431, 'grad_norm': 3.711669445037842, 'learning_rate': 1.519438736948583e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.8946, 'grad_norm': 2.551093816757202, 'learning_rate': 1.5158271859011554e-06}[Rank 3] Trainer log: {'loss': 0.8946, 'grad_norm': 2.551093816757202, 'learning_rate': 1.5158271859011554e-06} [Rank 1] Trainer log: {'loss': 0.8946, 'grad_norm': 2.551093816757202, 'learning_rate': 1.5158271859011554e-06} [Rank 0] Trainer log: {'loss': 0.8946, 'grad_norm': 2.551093816757202, 'learning_rate': 1.5158271859011554e-06} {'loss': 0.8946, 'grad_norm': 2.551093816757202, 'learning_rate': 1.5158271859011554e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7653, 'grad_norm': 2.075930118560791, 'learning_rate': 1.5122195800881156e-06} [Rank 0] Trainer log: {'loss': 0.7653, 'grad_norm': 2.075930118560791, 'learning_rate': 1.5122195800881156e-06}[Rank 1] Trainer log: {'loss': 0.7653, 'grad_norm': 2.075930118560791, 'learning_rate': 1.5122195800881156e-06} [Rank 2] Trainer log: {'loss': 0.7653, 'grad_norm': 2.075930118560791, 'learning_rate': 1.5122195800881156e-06} {'loss': 0.7653, 'grad_norm': 2.075930118560791, 'learning_rate': 1.5122195800881156e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.813, 'grad_norm': 5.00932502746582, 'learning_rate': 1.5086159211870445e-06}[Rank 3] Trainer log: {'loss': 0.813, 'grad_norm': 5.00932502746582, 'learning_rate': 1.5086159211870445e-06} [Rank 2] Trainer log: {'loss': 0.813, 'grad_norm': 5.00932502746582, 'learning_rate': 1.5086159211870445e-06} [Rank 1] Trainer log: {'loss': 0.813, 'grad_norm': 5.00932502746582, 'learning_rate': 1.5086159211870445e-06} {'loss': 0.813, 'grad_norm': 5.00932502746582, 'learning_rate': 1.5086159211870445e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.5705, 'grad_norm': 3.0220367908477783, 'learning_rate': 1.5050162108736788e-06}[Rank 3] Trainer log: {'loss': 0.5705, 'grad_norm': 3.0220367908477783, 'learning_rate': 1.5050162108736788e-06} [Rank 0] Trainer log: {'loss': 0.5705, 'grad_norm': 3.0220367908477783, 'learning_rate': 1.5050162108736788e-06} [Rank 2] Trainer log: {'loss': 0.5705, 'grad_norm': 3.0220367908477783, 'learning_rate': 1.5050162108736788e-06} {'loss': 0.5705, 'grad_norm': 3.0220367908477783, 'learning_rate': 1.5050162108736788e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.9162, 'grad_norm': 2.7622604370117188, 'learning_rate': 1.5014204508219288e-06} [Rank 1] Trainer log: {'loss': 0.9162, 'grad_norm': 2.7622604370117188, 'learning_rate': 1.5014204508219288e-06} [Rank 0] Trainer log: {'loss': 0.9162, 'grad_norm': 2.7622604370117188, 'learning_rate': 1.5014204508219288e-06} [Rank 2] Trainer log: {'loss': 0.9162, 'grad_norm': 2.7622604370117188, 'learning_rate': 1.5014204508219288e-06} {'loss': 0.9162, 'grad_norm': 2.7622604370117188, 'learning_rate': 1.5014204508219288e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.7828, 'grad_norm': 6.00980281829834, 'learning_rate': 1.4978286427038602e-06}[Rank 1] Trainer log: {'loss': 0.7828, 'grad_norm': 6.00980281829834, 'learning_rate': 1.4978286427038602e-06} [Rank 0] Trainer log: {'loss': 0.7828, 'grad_norm': 6.00980281829834, 'learning_rate': 1.4978286427038602e-06}[Rank 2] Trainer log: {'loss': 0.7828, 'grad_norm': 6.00980281829834, 'learning_rate': 1.4978286427038602e-06} {'loss': 0.7828, 'grad_norm': 6.00980281829834, 'learning_rate': 1.4978286427038602e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 1.0624, 'grad_norm': 7.930791854858398, 'learning_rate': 1.4942407881897015e-06} [Rank 1] Trainer log: {'loss': 1.0624, 'grad_norm': 7.930791854858398, 'learning_rate': 1.4942407881897015e-06} [Rank 0] Trainer log: {'loss': 1.0624, 'grad_norm': 7.930791854858398, 'learning_rate': 1.4942407881897015e-06}[Rank 2] Trainer log: {'loss': 1.0624, 'grad_norm': 7.930791854858398, 'learning_rate': 1.4942407881897015e-06} {'loss': 1.0624, 'grad_norm': 7.930791854858398, 'learning_rate': 1.4942407881897015e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.921, 'grad_norm': 4.227538108825684, 'learning_rate': 1.490656888947849e-06}[Rank 0] Trainer log: {'loss': 0.921, 'grad_norm': 4.227538108825684, 'learning_rate': 1.490656888947849e-06}[Rank 1] Trainer log: {'loss': 0.921, 'grad_norm': 4.227538108825684, 'learning_rate': 1.490656888947849e-06} [Rank 2] Trainer log: {'loss': 0.921, 'grad_norm': 4.227538108825684, 'learning_rate': 1.490656888947849e-06} {'loss': 0.921, 'grad_norm': 4.227538108825684, 'learning_rate': 1.490656888947849e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.7427, 'grad_norm': 4.787853240966797, 'learning_rate': 1.487076946644851e-06}[Rank 3] Trainer log: {'loss': 0.7427, 'grad_norm': 4.787853240966797, 'learning_rate': 1.487076946644851e-06} [Rank 0] Trainer log: {'loss': 0.7427, 'grad_norm': 4.787853240966797, 'learning_rate': 1.487076946644851e-06} [Rank 1] Trainer log: {'loss': 0.7427, 'grad_norm': 4.787853240966797, 'learning_rate': 1.487076946644851e-06} {'loss': 0.7427, 'grad_norm': 4.787853240966797, 'learning_rate': 1.487076946644851e-06, 'epoch': 0.83} [Rank 3] Trainer log: {'loss': 0.6585, 'grad_norm': 6.676933288574219, 'learning_rate': 1.4835009629454221e-06}[Rank 1] Trainer log: {'loss': 0.6585, 'grad_norm': 6.676933288574219, 'learning_rate': 1.4835009629454221e-06}[Rank 0] Trainer log: {'loss': 0.6585, 'grad_norm': 6.676933288574219, 'learning_rate': 1.4835009629454221e-06} [Rank 2] Trainer log: {'loss': 0.6585, 'grad_norm': 6.676933288574219, 'learning_rate': 1.4835009629454221e-06} {'loss': 0.6585, 'grad_norm': 6.676933288574219, 'learning_rate': 1.4835009629454221e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.8206, 'grad_norm': 2.1841177940368652, 'learning_rate': 1.4799289395124383e-06}[Rank 3] Trainer log: {'loss': 0.8206, 'grad_norm': 2.1841177940368652, 'learning_rate': 1.4799289395124383e-06}[Rank 1] Trainer log: {'loss': 0.8206, 'grad_norm': 2.1841177940368652, 'learning_rate': 1.4799289395124383e-06} [Rank 0] Trainer log: {'loss': 0.8206, 'grad_norm': 2.1841177940368652, 'learning_rate': 1.4799289395124383e-06} {'loss': 0.8206, 'grad_norm': 2.1841177940368652, 'learning_rate': 1.4799289395124383e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.9105, 'grad_norm': 3.033149480819702, 'learning_rate': 1.4763608780069261e-06}[Rank 1] Trainer log: {'loss': 0.9105, 'grad_norm': 3.033149480819702, 'learning_rate': 1.4763608780069261e-06}[Rank 3] Trainer log: {'loss': 0.9105, 'grad_norm': 3.033149480819702, 'learning_rate': 1.4763608780069261e-06} [Rank 0] Trainer log: {'loss': 0.9105, 'grad_norm': 3.033149480819702, 'learning_rate': 1.4763608780069261e-06} {'loss': 0.9105, 'grad_norm': 3.033149480819702, 'learning_rate': 1.4763608780069261e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.6104, 'grad_norm': 3.0075178146362305, 'learning_rate': 1.4727967800880727e-06} [Rank 2] Trainer log: {'loss': 0.6104, 'grad_norm': 3.0075178146362305, 'learning_rate': 1.4727967800880727e-06} [Rank 3] Trainer log: {'loss': 0.6104, 'grad_norm': 3.0075178146362305, 'learning_rate': 1.4727967800880727e-06}[Rank 0] Trainer log: {'loss': 0.6104, 'grad_norm': 3.0075178146362305, 'learning_rate': 1.4727967800880727e-06} {'loss': 0.6104, 'grad_norm': 3.0075178146362305, 'learning_rate': 1.4727967800880727e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.7037, 'grad_norm': 7.7090229988098145, 'learning_rate': 1.4692366474132268e-06}[Rank 1] Trainer log: {'loss': 0.7037, 'grad_norm': 7.7090229988098145, 'learning_rate': 1.4692366474132268e-06}[Rank 0] Trainer log: {'loss': 0.7037, 'grad_norm': 7.7090229988098145, 'learning_rate': 1.4692366474132268e-06} [Rank 3] Trainer log: {'loss': 0.7037, 'grad_norm': 7.7090229988098145, 'learning_rate': 1.4692366474132268e-06} {'loss': 0.7037, 'grad_norm': 7.7090229988098145, 'learning_rate': 1.4692366474132268e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.8254, 'grad_norm': 2.20866060256958, 'learning_rate': 1.4656804816378868e-06}[Rank 3] Trainer log: {'loss': 0.8254, 'grad_norm': 2.20866060256958, 'learning_rate': 1.4656804816378868e-06} [Rank 1] Trainer log: {'loss': 0.8254, 'grad_norm': 2.20866060256958, 'learning_rate': 1.4656804816378868e-06}[Rank 2] Trainer log: {'loss': 0.8254, 'grad_norm': 2.20866060256958, 'learning_rate': 1.4656804816378868e-06} {'loss': 0.8254, 'grad_norm': 2.20866060256958, 'learning_rate': 1.4656804816378868e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.923, 'grad_norm': 4.2560343742370605, 'learning_rate': 1.4621282844157082e-06}[Rank 0] Trainer log: {'loss': 0.923, 'grad_norm': 4.2560343742370605, 'learning_rate': 1.4621282844157082e-06}[Rank 1] Trainer log: {'loss': 0.923, 'grad_norm': 4.2560343742370605, 'learning_rate': 1.4621282844157082e-06} [Rank 3] Trainer log: {'loss': 0.923, 'grad_norm': 4.2560343742370605, 'learning_rate': 1.4621282844157082e-06} {'loss': 0.923, 'grad_norm': 4.2560343742370605, 'learning_rate': 1.4621282844157082e-06, 'epoch': 0.83} [Rank 1] Trainer log: {'loss': 0.652, 'grad_norm': 8.646852493286133, 'learning_rate': 1.4585800573985064e-06}[Rank 3] Trainer log: {'loss': 0.652, 'grad_norm': 8.646852493286133, 'learning_rate': 1.4585800573985064e-06}[Rank 2] Trainer log: {'loss': 0.652, 'grad_norm': 8.646852493286133, 'learning_rate': 1.4585800573985064e-06} [Rank 0] Trainer log: {'loss': 0.652, 'grad_norm': 8.646852493286133, 'learning_rate': 1.4585800573985064e-06} {'loss': 0.652, 'grad_norm': 8.646852493286133, 'learning_rate': 1.4585800573985064e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.7093, 'grad_norm': 7.5944976806640625, 'learning_rate': 1.455035802236241e-06}[Rank 3] Trainer log: {'loss': 0.7093, 'grad_norm': 7.5944976806640625, 'learning_rate': 1.455035802236241e-06} [Rank 0] Trainer log: {'loss': 0.7093, 'grad_norm': 7.5944976806640625, 'learning_rate': 1.455035802236241e-06} [Rank 1] Trainer log: {'loss': 0.7093, 'grad_norm': 7.5944976806640625, 'learning_rate': 1.455035802236241e-06} {'loss': 0.7093, 'grad_norm': 7.5944976806640625, 'learning_rate': 1.455035802236241e-06, 'epoch': 0.83} [Rank 0] Trainer log: {'loss': 0.9016, 'grad_norm': 4.447729110717773, 'learning_rate': 1.4514955205770332e-06}[Rank 1] Trainer log: {'loss': 0.9016, 'grad_norm': 4.447729110717773, 'learning_rate': 1.4514955205770332e-06}[Rank 2] Trainer log: {'loss': 0.9016, 'grad_norm': 4.447729110717773, 'learning_rate': 1.4514955205770332e-06} [Rank 3] Trainer log: {'loss': 0.9016, 'grad_norm': 4.447729110717773, 'learning_rate': 1.4514955205770332e-06} {'loss': 0.9016, 'grad_norm': 4.447729110717773, 'learning_rate': 1.4514955205770332e-06, 'epoch': 0.83} [Rank 2] Trainer log: {'loss': 0.6613, 'grad_norm': 7.730487823486328, 'learning_rate': 1.447959214067155e-06}[Rank 1] Trainer log: {'loss': 0.6613, 'grad_norm': 7.730487823486328, 'learning_rate': 1.447959214067155e-06} [Rank 3] Trainer log: {'loss': 0.6613, 'grad_norm': 7.730487823486328, 'learning_rate': 1.447959214067155e-06}[Rank 0] Trainer log: {'loss': 0.6613, 'grad_norm': 7.730487823486328, 'learning_rate': 1.447959214067155e-06} {'loss': 0.6613, 'grad_norm': 7.730487823486328, 'learning_rate': 1.447959214067155e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.9484, 'grad_norm': 3.716632604598999, 'learning_rate': 1.4444268843510257e-06}[Rank 3] Trainer log: {'loss': 0.9484, 'grad_norm': 3.716632604598999, 'learning_rate': 1.4444268843510257e-06}[Rank 1] Trainer log: {'loss': 0.9484, 'grad_norm': 3.716632604598999, 'learning_rate': 1.4444268843510257e-06} [Rank 0] Trainer log: {'loss': 0.9484, 'grad_norm': 3.716632604598999, 'learning_rate': 1.4444268843510257e-06} {'loss': 0.9484, 'grad_norm': 3.716632604598999, 'learning_rate': 1.4444268843510257e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 1.0223, 'grad_norm': 2.2118122577667236, 'learning_rate': 1.4408985330712165e-06}[Rank 1] Trainer log: {'loss': 1.0223, 'grad_norm': 2.2118122577667236, 'learning_rate': 1.4408985330712165e-06}[Rank 0] Trainer log: {'loss': 1.0223, 'grad_norm': 2.2118122577667236, 'learning_rate': 1.4408985330712165e-06} [Rank 2] Trainer log: {'loss': 1.0223, 'grad_norm': 2.2118122577667236, 'learning_rate': 1.4408985330712165e-06} {'loss': 1.0223, 'grad_norm': 2.2118122577667236, 'learning_rate': 1.4408985330712165e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8841, 'grad_norm': 3.1382713317871094, 'learning_rate': 1.43737416186845e-06}[Rank 0] Trainer log: {'loss': 0.8841, 'grad_norm': 3.1382713317871094, 'learning_rate': 1.43737416186845e-06} [Rank 1] Trainer log: {'loss': 0.8841, 'grad_norm': 3.1382713317871094, 'learning_rate': 1.43737416186845e-06} [Rank 2] Trainer log: {'loss': 0.8841, 'grad_norm': 3.1382713317871094, 'learning_rate': 1.43737416186845e-06} {'loss': 0.8841, 'grad_norm': 3.1382713317871094, 'learning_rate': 1.43737416186845e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.9692, 'grad_norm': 5.836212158203125, 'learning_rate': 1.4338537723816014e-06}[Rank 2] Trainer log: {'loss': 0.9692, 'grad_norm': 5.836212158203125, 'learning_rate': 1.4338537723816014e-06} [Rank 3] Trainer log: {'loss': 0.9692, 'grad_norm': 5.836212158203125, 'learning_rate': 1.4338537723816014e-06} [Rank 0] Trainer log: {'loss': 0.9692, 'grad_norm': 5.836212158203125, 'learning_rate': 1.4338537723816014e-06} {'loss': 0.9692, 'grad_norm': 5.836212158203125, 'learning_rate': 1.4338537723816014e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8895, 'grad_norm': 5.646044731140137, 'learning_rate': 1.4303373662476882e-06}[Rank 0] Trainer log: {'loss': 0.8895, 'grad_norm': 5.646044731140137, 'learning_rate': 1.4303373662476882e-06} [Rank 1] Trainer log: {'loss': 0.8895, 'grad_norm': 5.646044731140137, 'learning_rate': 1.4303373662476882e-06} [Rank 2] Trainer log: {'loss': 0.8895, 'grad_norm': 5.646044731140137, 'learning_rate': 1.4303373662476882e-06} {'loss': 0.8895, 'grad_norm': 5.646044731140137, 'learning_rate': 1.4303373662476882e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.8386, 'grad_norm': 8.110249519348145, 'learning_rate': 1.4268249451018767e-06}[Rank 1] Trainer log: {'loss': 0.8386, 'grad_norm': 8.110249519348145, 'learning_rate': 1.4268249451018767e-06} [Rank 2] Trainer log: {'loss': 0.8386, 'grad_norm': 8.110249519348145, 'learning_rate': 1.4268249451018767e-06} [Rank 3] Trainer log: {'loss': 0.8386, 'grad_norm': 8.110249519348145, 'learning_rate': 1.4268249451018767e-06} {'loss': 0.8386, 'grad_norm': 8.110249519348145, 'learning_rate': 1.4268249451018767e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.5461, 'grad_norm': 2.967996597290039, 'learning_rate': 1.4233165105774827e-06}[Rank 1] Trainer log: {'loss': 0.5461, 'grad_norm': 2.967996597290039, 'learning_rate': 1.4233165105774827e-06} [Rank 2] Trainer log: {'loss': 0.5461, 'grad_norm': 2.967996597290039, 'learning_rate': 1.4233165105774827e-06} [Rank 0] Trainer log: {'loss': 0.5461, 'grad_norm': 2.967996597290039, 'learning_rate': 1.4233165105774827e-06} {'loss': 0.5461, 'grad_norm': 2.967996597290039, 'learning_rate': 1.4233165105774827e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7685, 'grad_norm': 2.8843421936035156, 'learning_rate': 1.419812064305971e-06}[Rank 0] Trainer log: {'loss': 0.7685, 'grad_norm': 2.8843421936035156, 'learning_rate': 1.419812064305971e-06}[Rank 1] Trainer log: {'loss': 0.7685, 'grad_norm': 2.8843421936035156, 'learning_rate': 1.419812064305971e-06} {'loss': 0.7685, 'grad_norm': 2.8843421936035156, 'learning_rate': 1.419812064305971e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.7685, 'grad_norm': 2.8843421936035156, 'learning_rate': 1.419812064305971e-06} [Rank 0] Trainer log: {'loss': 0.6133, 'grad_norm': 1.870994210243225, 'learning_rate': 1.4163116079169415e-06}[Rank 3] Trainer log: {'loss': 0.6133, 'grad_norm': 1.870994210243225, 'learning_rate': 1.4163116079169415e-06}[Rank 1] Trainer log: {'loss': 0.6133, 'grad_norm': 1.870994210243225, 'learning_rate': 1.4163116079169415e-06} [Rank 2] Trainer log: {'loss': 0.6133, 'grad_norm': 1.870994210243225, 'learning_rate': 1.4163116079169415e-06} {'loss': 0.6133, 'grad_norm': 1.870994210243225, 'learning_rate': 1.4163116079169415e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 1.0437, 'grad_norm': 2.1682708263397217, 'learning_rate': 1.412815143038152e-06} [Rank 0] Trainer log: {'loss': 1.0437, 'grad_norm': 2.1682708263397217, 'learning_rate': 1.412815143038152e-06}[Rank 2] Trainer log: {'loss': 1.0437, 'grad_norm': 2.1682708263397217, 'learning_rate': 1.412815143038152e-06} [Rank 1] Trainer log: {'loss': 1.0437, 'grad_norm': 2.1682708263397217, 'learning_rate': 1.412815143038152e-06} {'loss': 1.0437, 'grad_norm': 2.1682708263397217, 'learning_rate': 1.412815143038152e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.7745, 'grad_norm': 6.435252666473389, 'learning_rate': 1.4093226712954933e-06}[Rank 3] Trainer log: {'loss': 0.7745, 'grad_norm': 6.435252666473389, 'learning_rate': 1.4093226712954933e-06} [Rank 0] Trainer log: {'loss': 0.7745, 'grad_norm': 6.435252666473389, 'learning_rate': 1.4093226712954933e-06}[Rank 2] Trainer log: {'loss': 0.7745, 'grad_norm': 6.435252666473389, 'learning_rate': 1.4093226712954933e-06} {'loss': 0.7745, 'grad_norm': 6.435252666473389, 'learning_rate': 1.4093226712954933e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.643, 'grad_norm': 3.676038980484009, 'learning_rate': 1.4058341943130027e-06} [Rank 1] Trainer log: {'loss': 0.643, 'grad_norm': 3.676038980484009, 'learning_rate': 1.4058341943130027e-06} [Rank 0] Trainer log: {'loss': 0.643, 'grad_norm': 3.676038980484009, 'learning_rate': 1.4058341943130027e-06}[Rank 2] Trainer log: {'loss': 0.643, 'grad_norm': 3.676038980484009, 'learning_rate': 1.4058341943130027e-06} {'loss': 0.643, 'grad_norm': 3.676038980484009, 'learning_rate': 1.4058341943130027e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.6295, 'grad_norm': 2.192955732345581, 'learning_rate': 1.4023497137128628e-06}[Rank 3] Trainer log: {'loss': 0.6295, 'grad_norm': 2.192955732345581, 'learning_rate': 1.4023497137128628e-06} [Rank 2] Trainer log: {'loss': 0.6295, 'grad_norm': 2.192955732345581, 'learning_rate': 1.4023497137128628e-06} [Rank 1] Trainer log: {'loss': 0.6295, 'grad_norm': 2.192955732345581, 'learning_rate': 1.4023497137128628e-06} {'loss': 0.6295, 'grad_norm': 2.192955732345581, 'learning_rate': 1.4023497137128628e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.5771, 'grad_norm': 5.909188747406006, 'learning_rate': 1.3988692311153961e-06}[Rank 0] Trainer log: {'loss': 0.5771, 'grad_norm': 5.909188747406006, 'learning_rate': 1.3988692311153961e-06}[Rank 2] Trainer log: {'loss': 0.5771, 'grad_norm': 5.909188747406006, 'learning_rate': 1.3988692311153961e-06} [Rank 1] Trainer log: {'loss': 0.5771, 'grad_norm': 5.909188747406006, 'learning_rate': 1.3988692311153961e-06} {'loss': 0.5771, 'grad_norm': 5.909188747406006, 'learning_rate': 1.3988692311153961e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.7456, 'grad_norm': 6.214536190032959, 'learning_rate': 1.3953927481390629e-06}[Rank 3] Trainer log: {'loss': 0.7456, 'grad_norm': 6.214536190032959, 'learning_rate': 1.3953927481390629e-06}[Rank 1] Trainer log: {'loss': 0.7456, 'grad_norm': 6.214536190032959, 'learning_rate': 1.3953927481390629e-06} [Rank 2] Trainer log: {'loss': 0.7456, 'grad_norm': 6.214536190032959, 'learning_rate': 1.3953927481390629e-06} {'loss': 0.7456, 'grad_norm': 6.214536190032959, 'learning_rate': 1.3953927481390629e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.7939, 'grad_norm': 8.912454605102539, 'learning_rate': 1.3919202664004672e-06}[Rank 3] Trainer log: {'loss': 0.7939, 'grad_norm': 8.912454605102539, 'learning_rate': 1.3919202664004672e-06} [Rank 0] Trainer log: {'loss': 0.7939, 'grad_norm': 8.912454605102539, 'learning_rate': 1.3919202664004672e-06}[Rank 1] Trainer log: {'loss': 0.7939, 'grad_norm': 8.912454605102539, 'learning_rate': 1.3919202664004672e-06} {'loss': 0.7939, 'grad_norm': 8.912454605102539, 'learning_rate': 1.3919202664004672e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.7129, 'grad_norm': 5.9227190017700195, 'learning_rate': 1.3884517875143544e-06}[Rank 3] Trainer log: {'loss': 0.7129, 'grad_norm': 5.9227190017700195, 'learning_rate': 1.3884517875143544e-06} [Rank 1] Trainer log: {'loss': 0.7129, 'grad_norm': 5.9227190017700195, 'learning_rate': 1.3884517875143544e-06} [Rank 2] Trainer log: {'loss': 0.7129, 'grad_norm': 5.9227190017700195, 'learning_rate': 1.3884517875143544e-06} {'loss': 0.7129, 'grad_norm': 5.9227190017700195, 'learning_rate': 1.3884517875143544e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 1.0285, 'grad_norm': 1.6861259937286377, 'learning_rate': 1.3849873130936009e-06} [Rank 3] Trainer log: {'loss': 1.0285, 'grad_norm': 1.6861259937286377, 'learning_rate': 1.3849873130936009e-06} [Rank 0] Trainer log: {'loss': 1.0285, 'grad_norm': 1.6861259937286377, 'learning_rate': 1.3849873130936009e-06}[Rank 1] Trainer log: {'loss': 1.0285, 'grad_norm': 1.6861259937286377, 'learning_rate': 1.3849873130936009e-06} {'loss': 1.0285, 'grad_norm': 1.6861259937286377, 'learning_rate': 1.3849873130936009e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8022, 'grad_norm': 5.280514717102051, 'learning_rate': 1.3815268447492303e-06}[Rank 0] Trainer log: {'loss': 0.8022, 'grad_norm': 5.280514717102051, 'learning_rate': 1.3815268447492303e-06}[Rank 1] Trainer log: {'loss': 0.8022, 'grad_norm': 5.280514717102051, 'learning_rate': 1.3815268447492303e-06} {'loss': 0.8022, 'grad_norm': 5.280514717102051, 'learning_rate': 1.3815268447492303e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.8022, 'grad_norm': 5.280514717102051, 'learning_rate': 1.3815268447492303e-06} [Rank 1] Trainer log: {'loss': 0.704, 'grad_norm': 3.385974645614624, 'learning_rate': 1.3780703840903976e-06} [Rank 0] Trainer log: {'loss': 0.704, 'grad_norm': 3.385974645614624, 'learning_rate': 1.3780703840903976e-06}[Rank 3] Trainer log: {'loss': 0.704, 'grad_norm': 3.385974645614624, 'learning_rate': 1.3780703840903976e-06} [Rank 2] Trainer log: {'loss': 0.704, 'grad_norm': 3.385974645614624, 'learning_rate': 1.3780703840903976e-06} {'loss': 0.704, 'grad_norm': 3.385974645614624, 'learning_rate': 1.3780703840903976e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7621, 'grad_norm': 6.753222942352295, 'learning_rate': 1.3746179327243914e-06} [Rank 0] Trainer log: {'loss': 0.7621, 'grad_norm': 6.753222942352295, 'learning_rate': 1.3746179327243914e-06}[Rank 1] Trainer log: {'loss': 0.7621, 'grad_norm': 6.753222942352295, 'learning_rate': 1.3746179327243914e-06} [Rank 2] Trainer log: {'loss': 0.7621, 'grad_norm': 6.753222942352295, 'learning_rate': 1.3746179327243914e-06} {'loss': 0.7621, 'grad_norm': 6.753222942352295, 'learning_rate': 1.3746179327243914e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.5796, 'grad_norm': 17.67717170715332, 'learning_rate': 1.3711694922566454e-06} [Rank 2] Trainer log: {'loss': 0.5796, 'grad_norm': 17.67717170715332, 'learning_rate': 1.3711694922566454e-06} [Rank 0] Trainer log: {'loss': 0.5796, 'grad_norm': 17.67717170715332, 'learning_rate': 1.3711694922566454e-06} [Rank 1] Trainer log: {'loss': 0.5796, 'grad_norm': 17.67717170715332, 'learning_rate': 1.3711694922566454e-06} {'loss': 0.5796, 'grad_norm': 17.67717170715332, 'learning_rate': 1.3711694922566454e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 1.029, 'grad_norm': 3.095717191696167, 'learning_rate': 1.3677250642907203e-06}[Rank 0] Trainer log: {'loss': 1.029, 'grad_norm': 3.095717191696167, 'learning_rate': 1.3677250642907203e-06}[Rank 1] Trainer log: {'loss': 1.029, 'grad_norm': 3.095717191696167, 'learning_rate': 1.3677250642907203e-06} {'loss': 1.029, 'grad_norm': 3.095717191696167, 'learning_rate': 1.3677250642907203e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 1.029, 'grad_norm': 3.095717191696167, 'learning_rate': 1.3677250642907203e-06} [Rank 3] Trainer log: {'loss': 0.9015, 'grad_norm': 2.992948293685913, 'learning_rate': 1.3642846504283113e-06}[Rank 0] Trainer log: {'loss': 0.9015, 'grad_norm': 2.992948293685913, 'learning_rate': 1.3642846504283113e-06} [Rank 2] Trainer log: {'loss': 0.9015, 'grad_norm': 2.992948293685913, 'learning_rate': 1.3642846504283113e-06} [Rank 1] Trainer log: {'loss': 0.9015, 'grad_norm': 2.992948293685913, 'learning_rate': 1.3642846504283113e-06} {'loss': 0.9015, 'grad_norm': 2.992948293685913, 'learning_rate': 1.3642846504283113e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.9125, 'grad_norm': 3.9714865684509277, 'learning_rate': 1.36084825226925e-06} [Rank 2] Trainer log: {'loss': 0.9125, 'grad_norm': 3.9714865684509277, 'learning_rate': 1.36084825226925e-06} [Rank 3] Trainer log: {'loss': 0.9125, 'grad_norm': 3.9714865684509277, 'learning_rate': 1.36084825226925e-06} [Rank 0] Trainer log: {'loss': 0.9125, 'grad_norm': 3.9714865684509277, 'learning_rate': 1.36084825226925e-06} {'loss': 0.9125, 'grad_norm': 3.9714865684509277, 'learning_rate': 1.36084825226925e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.9304, 'grad_norm': 5.939352512359619, 'learning_rate': 1.3574158714115026e-06} [Rank 0] Trainer log: {'loss': 0.9304, 'grad_norm': 5.939352512359619, 'learning_rate': 1.3574158714115026e-06}[Rank 1] Trainer log: {'loss': 0.9304, 'grad_norm': 5.939352512359619, 'learning_rate': 1.3574158714115026e-06} [Rank 2] Trainer log: {'loss': 0.9304, 'grad_norm': 5.939352512359619, 'learning_rate': 1.3574158714115026e-06} {'loss': 0.9304, 'grad_norm': 5.939352512359619, 'learning_rate': 1.3574158714115026e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.9366, 'grad_norm': 3.1616151332855225, 'learning_rate': 1.3539875094511613e-06} [Rank 0] Trainer log: {'loss': 0.9366, 'grad_norm': 3.1616151332855225, 'learning_rate': 1.3539875094511613e-06}[Rank 1] Trainer log: {'loss': 0.9366, 'grad_norm': 3.1616151332855225, 'learning_rate': 1.3539875094511613e-06} [Rank 3] Trainer log: {'loss': 0.9366, 'grad_norm': 3.1616151332855225, 'learning_rate': 1.3539875094511613e-06} {'loss': 0.9366, 'grad_norm': 3.1616151332855225, 'learning_rate': 1.3539875094511613e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8003, 'grad_norm': 6.982831954956055, 'learning_rate': 1.3505631679824516e-06} [Rank 0] Trainer log: {'loss': 0.8003, 'grad_norm': 6.982831954956055, 'learning_rate': 1.3505631679824516e-06}[Rank 2] Trainer log: {'loss': 0.8003, 'grad_norm': 6.982831954956055, 'learning_rate': 1.3505631679824516e-06} [Rank 1] Trainer log: {'loss': 0.8003, 'grad_norm': 6.982831954956055, 'learning_rate': 1.3505631679824516e-06} {'loss': 0.8003, 'grad_norm': 6.982831954956055, 'learning_rate': 1.3505631679824516e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 1.0518, 'grad_norm': 2.4541237354278564, 'learning_rate': 1.3471428485977344e-06}[Rank 0] Trainer log: {'loss': 1.0518, 'grad_norm': 2.4541237354278564, 'learning_rate': 1.3471428485977344e-06} [Rank 1] Trainer log: {'loss': 1.0518, 'grad_norm': 2.4541237354278564, 'learning_rate': 1.3471428485977344e-06}[Rank 3] Trainer log: {'loss': 1.0518, 'grad_norm': 2.4541237354278564, 'learning_rate': 1.3471428485977344e-06} {'loss': 1.0518, 'grad_norm': 2.4541237354278564, 'learning_rate': 1.3471428485977344e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.7592, 'grad_norm': 4.358904838562012, 'learning_rate': 1.3437265528874933e-06}[Rank 1] Trainer log: {'loss': 0.7592, 'grad_norm': 4.358904838562012, 'learning_rate': 1.3437265528874933e-06}[Rank 3] Trainer log: {'loss': 0.7592, 'grad_norm': 4.358904838562012, 'learning_rate': 1.3437265528874933e-06} [Rank 2] Trainer log: {'loss': 0.7592, 'grad_norm': 4.358904838562012, 'learning_rate': 1.3437265528874933e-06} {'loss': 0.7592, 'grad_norm': 4.358904838562012, 'learning_rate': 1.3437265528874933e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.7551, 'grad_norm': 6.7462544441223145, 'learning_rate': 1.3403142824403415e-06} [Rank 3] Trainer log: {'loss': 0.7551, 'grad_norm': 6.7462544441223145, 'learning_rate': 1.3403142824403415e-06} [Rank 0] Trainer log: {'loss': 0.7551, 'grad_norm': 6.7462544441223145, 'learning_rate': 1.3403142824403415e-06}[Rank 2] Trainer log: {'loss': 0.7551, 'grad_norm': 6.7462544441223145, 'learning_rate': 1.3403142824403415e-06} {'loss': 0.7551, 'grad_norm': 6.7462544441223145, 'learning_rate': 1.3403142824403415e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.8887, 'grad_norm': 6.386147975921631, 'learning_rate': 1.3369060388430266e-06}[Rank 3] Trainer log: {'loss': 0.8887, 'grad_norm': 6.386147975921631, 'learning_rate': 1.3369060388430266e-06} [Rank 0] Trainer log: {'loss': 0.8887, 'grad_norm': 6.386147975921631, 'learning_rate': 1.3369060388430266e-06} [Rank 2] Trainer log: {'loss': 0.8887, 'grad_norm': 6.386147975921631, 'learning_rate': 1.3369060388430266e-06} {'loss': 0.8887, 'grad_norm': 6.386147975921631, 'learning_rate': 1.3369060388430266e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.9414, 'grad_norm': 6.757012367248535, 'learning_rate': 1.3335018236804164e-06}[Rank 3] Trainer log: {'loss': 0.9414, 'grad_norm': 6.757012367248535, 'learning_rate': 1.3335018236804164e-06} [Rank 2] Trainer log: {'loss': 0.9414, 'grad_norm': 6.757012367248535, 'learning_rate': 1.3335018236804164e-06} [Rank 0] Trainer log: {'loss': 0.9414, 'grad_norm': 6.757012367248535, 'learning_rate': 1.3335018236804164e-06} {'loss': 0.9414, 'grad_norm': 6.757012367248535, 'learning_rate': 1.3335018236804164e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8509, 'grad_norm': 11.897677421569824, 'learning_rate': 1.3301016385355093e-06}[Rank 1] Trainer log: {'loss': 0.8509, 'grad_norm': 11.897677421569824, 'learning_rate': 1.3301016385355093e-06}[Rank 2] Trainer log: {'loss': 0.8509, 'grad_norm': 11.897677421569824, 'learning_rate': 1.3301016385355093e-06} [Rank 0] Trainer log: {'loss': 0.8509, 'grad_norm': 11.897677421569824, 'learning_rate': 1.3301016385355093e-06} {'loss': 0.8509, 'grad_norm': 11.897677421569824, 'learning_rate': 1.3301016385355093e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.7117, 'grad_norm': 5.67860221862793, 'learning_rate': 1.3267054849894335e-06}[Rank 3] Trainer log: {'loss': 0.7117, 'grad_norm': 5.67860221862793, 'learning_rate': 1.3267054849894335e-06}[Rank 2] Trainer log: {'loss': 0.7117, 'grad_norm': 5.67860221862793, 'learning_rate': 1.3267054849894335e-06} [Rank 1] Trainer log: {'loss': 0.7117, 'grad_norm': 5.67860221862793, 'learning_rate': 1.3267054849894335e-06} {'loss': 0.7117, 'grad_norm': 5.67860221862793, 'learning_rate': 1.3267054849894335e-06, 'epoch': 0.84} [Rank 0] Trainer log: {'loss': 0.47, 'grad_norm': 5.0793046951293945, 'learning_rate': 1.3233133646214336e-06}[Rank 3] Trainer log: {'loss': 0.47, 'grad_norm': 5.0793046951293945, 'learning_rate': 1.3233133646214336e-06} [Rank 1] Trainer log: {'loss': 0.47, 'grad_norm': 5.0793046951293945, 'learning_rate': 1.3233133646214336e-06} [Rank 2] Trainer log: {'loss': 0.47, 'grad_norm': 5.0793046951293945, 'learning_rate': 1.3233133646214336e-06} {'loss': 0.47, 'grad_norm': 5.0793046951293945, 'learning_rate': 1.3233133646214336e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.8642, 'grad_norm': 6.744845867156982, 'learning_rate': 1.3199252790088824e-06}[Rank 0] Trainer log: {'loss': 0.8642, 'grad_norm': 6.744845867156982, 'learning_rate': 1.3199252790088824e-06}[Rank 3] Trainer log: {'loss': 0.8642, 'grad_norm': 6.744845867156982, 'learning_rate': 1.3199252790088824e-06} [Rank 1] Trainer log: {'loss': 0.8642, 'grad_norm': 6.744845867156982, 'learning_rate': 1.3199252790088824e-06} {'loss': 0.8642, 'grad_norm': 6.744845867156982, 'learning_rate': 1.3199252790088824e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.8526, 'grad_norm': 5.758810520172119, 'learning_rate': 1.3165412297272817e-06}[Rank 0] Trainer log: {'loss': 0.8526, 'grad_norm': 5.758810520172119, 'learning_rate': 1.3165412297272817e-06}[Rank 1] Trainer log: {'loss': 0.8526, 'grad_norm': 5.758810520172119, 'learning_rate': 1.3165412297272817e-06} [Rank 2] Trainer log: {'loss': 0.8526, 'grad_norm': 5.758810520172119, 'learning_rate': 1.3165412297272817e-06} {'loss': 0.8526, 'grad_norm': 5.758810520172119, 'learning_rate': 1.3165412297272817e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.9097, 'grad_norm': 4.039095878601074, 'learning_rate': 1.3131612183502485e-06} [Rank 1] Trainer log: {'loss': 0.9097, 'grad_norm': 4.039095878601074, 'learning_rate': 1.3131612183502485e-06} [Rank 0] Trainer log: {'loss': 0.9097, 'grad_norm': 4.039095878601074, 'learning_rate': 1.3131612183502485e-06}[Rank 2] Trainer log: {'loss': 0.9097, 'grad_norm': 4.039095878601074, 'learning_rate': 1.3131612183502485e-06} {'loss': 0.9097, 'grad_norm': 4.039095878601074, 'learning_rate': 1.3131612183502485e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.8012, 'grad_norm': 3.071324586868286, 'learning_rate': 1.3097852464495242e-06}[Rank 0] Trainer log: {'loss': 0.8012, 'grad_norm': 3.071324586868286, 'learning_rate': 1.3097852464495242e-06}[Rank 3] Trainer log: {'loss': 0.8012, 'grad_norm': 3.071324586868286, 'learning_rate': 1.3097852464495242e-06} [Rank 2] Trainer log: {'loss': 0.8012, 'grad_norm': 3.071324586868286, 'learning_rate': 1.3097852464495242e-06} {'loss': 0.8012, 'grad_norm': 3.071324586868286, 'learning_rate': 1.3097852464495242e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7934, 'grad_norm': 3.1020188331604004, 'learning_rate': 1.3064133155949765e-06}[Rank 1] Trainer log: {'loss': 0.7934, 'grad_norm': 3.1020188331604004, 'learning_rate': 1.3064133155949765e-06} [Rank 0] Trainer log: {'loss': 0.7934, 'grad_norm': 3.1020188331604004, 'learning_rate': 1.3064133155949765e-06} [Rank 2] Trainer log: {'loss': 0.7934, 'grad_norm': 3.1020188331604004, 'learning_rate': 1.3064133155949765e-06} {'loss': 0.7934, 'grad_norm': 3.1020188331604004, 'learning_rate': 1.3064133155949765e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.8924, 'grad_norm': 5.157384872436523, 'learning_rate': 1.3030454273545879e-06}[Rank 3] Trainer log: {'loss': 0.8924, 'grad_norm': 5.157384872436523, 'learning_rate': 1.3030454273545879e-06}[Rank 1] Trainer log: {'loss': 0.8924, 'grad_norm': 5.157384872436523, 'learning_rate': 1.3030454273545879e-06} [Rank 0] Trainer log: {'loss': 0.8924, 'grad_norm': 5.157384872436523, 'learning_rate': 1.3030454273545879e-06} {'loss': 0.8924, 'grad_norm': 5.157384872436523, 'learning_rate': 1.3030454273545879e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.902, 'grad_norm': 8.359630584716797, 'learning_rate': 1.2996815832944653e-06}[Rank 1] Trainer log: {'loss': 0.902, 'grad_norm': 8.359630584716797, 'learning_rate': 1.2996815832944653e-06} [Rank 3] Trainer log: {'loss': 0.902, 'grad_norm': 8.359630584716797, 'learning_rate': 1.2996815832944653e-06} [Rank 0] Trainer log: {'loss': 0.902, 'grad_norm': 8.359630584716797, 'learning_rate': 1.2996815832944653e-06} {'loss': 0.902, 'grad_norm': 8.359630584716797, 'learning_rate': 1.2996815832944653e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7759, 'grad_norm': 5.88267707824707, 'learning_rate': 1.2963217849788357e-06}[Rank 1] Trainer log: {'loss': 0.7759, 'grad_norm': 5.88267707824707, 'learning_rate': 1.2963217849788357e-06}[Rank 0] Trainer log: {'loss': 0.7759, 'grad_norm': 5.88267707824707, 'learning_rate': 1.2963217849788357e-06} [Rank 2] Trainer log: {'loss': 0.7759, 'grad_norm': 5.88267707824707, 'learning_rate': 1.2963217849788357e-06} {'loss': 0.7759, 'grad_norm': 5.88267707824707, 'learning_rate': 1.2963217849788357e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7885, 'grad_norm': 4.169021129608154, 'learning_rate': 1.29296603397004e-06}[Rank 2] Trainer log: {'loss': 0.7885, 'grad_norm': 4.169021129608154, 'learning_rate': 1.29296603397004e-06}[Rank 1] Trainer log: {'loss': 0.7885, 'grad_norm': 4.169021129608154, 'learning_rate': 1.29296603397004e-06} [Rank 0] Trainer log: {'loss': 0.7885, 'grad_norm': 4.169021129608154, 'learning_rate': 1.29296603397004e-06} {'loss': 0.7885, 'grad_norm': 4.169021129608154, 'learning_rate': 1.29296603397004e-06, 'epoch': 0.84} [Rank 2] Trainer log: {'loss': 0.9399, 'grad_norm': 7.567420482635498, 'learning_rate': 1.289614331828538e-06} [Rank 0] Trainer log: {'loss': 0.9399, 'grad_norm': 7.567420482635498, 'learning_rate': 1.289614331828538e-06}[Rank 3] Trainer log: {'loss': 0.9399, 'grad_norm': 7.567420482635498, 'learning_rate': 1.289614331828538e-06} [Rank 1] Trainer log: {'loss': 0.9399, 'grad_norm': 7.567420482635498, 'learning_rate': 1.289614331828538e-06} {'loss': 0.9399, 'grad_norm': 7.567420482635498, 'learning_rate': 1.289614331828538e-06, 'epoch': 0.84} [Rank 1] Trainer log: {'loss': 0.9877, 'grad_norm': 3.092144012451172, 'learning_rate': 1.286266680112912e-06}[Rank 3] Trainer log: {'loss': 0.9877, 'grad_norm': 3.092144012451172, 'learning_rate': 1.286266680112912e-06} [Rank 2] Trainer log: {'loss': 0.9877, 'grad_norm': 3.092144012451172, 'learning_rate': 1.286266680112912e-06} [Rank 0] Trainer log: {'loss': 0.9877, 'grad_norm': 3.092144012451172, 'learning_rate': 1.286266680112912e-06} {'loss': 0.9877, 'grad_norm': 3.092144012451172, 'learning_rate': 1.286266680112912e-06, 'epoch': 0.84} [Rank 3] Trainer log: {'loss': 0.7772, 'grad_norm': 5.4361724853515625, 'learning_rate': 1.2829230803798554e-06} [Rank 1] Trainer log: {'loss': 0.7772, 'grad_norm': 5.4361724853515625, 'learning_rate': 1.2829230803798554e-06} [Rank 2] Trainer log: {'loss': 0.7772, 'grad_norm': 5.4361724853515625, 'learning_rate': 1.2829230803798554e-06} [Rank 0] Trainer log: {'loss': 0.7772, 'grad_norm': 5.4361724853515625, 'learning_rate': 1.2829230803798554e-06} {'loss': 0.7772, 'grad_norm': 5.4361724853515625, 'learning_rate': 1.2829230803798554e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.8443, 'grad_norm': 2.5807595252990723, 'learning_rate': 1.2795835341841767e-06}[Rank 1] Trainer log: {'loss': 0.8443, 'grad_norm': 2.5807595252990723, 'learning_rate': 1.2795835341841767e-06} [Rank 3] Trainer log: {'loss': 0.8443, 'grad_norm': 2.5807595252990723, 'learning_rate': 1.2795835341841767e-06} [Rank 2] Trainer log: {'loss': 0.8443, 'grad_norm': 2.5807595252990723, 'learning_rate': 1.2795835341841767e-06} {'loss': 0.8443, 'grad_norm': 2.5807595252990723, 'learning_rate': 1.2795835341841767e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8838, 'grad_norm': 10.728957176208496, 'learning_rate': 1.276248043078806e-06}[Rank 2] Trainer log: {'loss': 0.8838, 'grad_norm': 10.728957176208496, 'learning_rate': 1.276248043078806e-06}[Rank 0] Trainer log: {'loss': 0.8838, 'grad_norm': 10.728957176208496, 'learning_rate': 1.276248043078806e-06} [Rank 1] Trainer log: {'loss': 0.8838, 'grad_norm': 10.728957176208496, 'learning_rate': 1.276248043078806e-06} {'loss': 0.8838, 'grad_norm': 10.728957176208496, 'learning_rate': 1.276248043078806e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8099, 'grad_norm': 6.73399543762207, 'learning_rate': 1.2729166086147803e-06}[Rank 1] Trainer log: {'loss': 0.8099, 'grad_norm': 6.73399543762207, 'learning_rate': 1.2729166086147803e-06}[Rank 0] Trainer log: {'loss': 0.8099, 'grad_norm': 6.73399543762207, 'learning_rate': 1.2729166086147803e-06} [Rank 2] Trainer log: {'loss': 0.8099, 'grad_norm': 6.73399543762207, 'learning_rate': 1.2729166086147803e-06} {'loss': 0.8099, 'grad_norm': 6.73399543762207, 'learning_rate': 1.2729166086147803e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9413, 'grad_norm': 7.313668251037598, 'learning_rate': 1.2695892323412552e-06} [Rank 1] Trainer log: {'loss': 0.9413, 'grad_norm': 7.313668251037598, 'learning_rate': 1.2695892323412552e-06}[Rank 0] Trainer log: {'loss': 0.9413, 'grad_norm': 7.313668251037598, 'learning_rate': 1.2695892323412552e-06} [Rank 2] Trainer log: {'loss': 0.9413, 'grad_norm': 7.313668251037598, 'learning_rate': 1.2695892323412552e-06} {'loss': 0.9413, 'grad_norm': 7.313668251037598, 'learning_rate': 1.2695892323412552e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.7148, 'grad_norm': 12.130159378051758, 'learning_rate': 1.266265915805499e-06} [Rank 1] Trainer log: {'loss': 0.7148, 'grad_norm': 12.130159378051758, 'learning_rate': 1.266265915805499e-06} [Rank 2] Trainer log: {'loss': 0.7148, 'grad_norm': 12.130159378051758, 'learning_rate': 1.266265915805499e-06} [Rank 0] Trainer log: {'loss': 0.7148, 'grad_norm': 12.130159378051758, 'learning_rate': 1.266265915805499e-06} {'loss': 0.7148, 'grad_norm': 12.130159378051758, 'learning_rate': 1.266265915805499e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.605, 'grad_norm': 2.3057913780212402, 'learning_rate': 1.2629466605528894e-06} [Rank 1] Trainer log: {'loss': 0.605, 'grad_norm': 2.3057913780212402, 'learning_rate': 1.2629466605528894e-06}[Rank 0] Trainer log: {'loss': 0.605, 'grad_norm': 2.3057913780212402, 'learning_rate': 1.2629466605528894e-06} [Rank 2] Trainer log: {'loss': 0.605, 'grad_norm': 2.3057913780212402, 'learning_rate': 1.2629466605528894e-06} {'loss': 0.605, 'grad_norm': 2.3057913780212402, 'learning_rate': 1.2629466605528894e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8917, 'grad_norm': 3.420635461807251, 'learning_rate': 1.2596314681269173e-06}[Rank 1] Trainer log: {'loss': 0.8917, 'grad_norm': 3.420635461807251, 'learning_rate': 1.2596314681269173e-06} [Rank 0] Trainer log: {'loss': 0.8917, 'grad_norm': 3.420635461807251, 'learning_rate': 1.2596314681269173e-06} [Rank 2] Trainer log: {'loss': 0.8917, 'grad_norm': 3.420635461807251, 'learning_rate': 1.2596314681269173e-06} {'loss': 0.8917, 'grad_norm': 3.420635461807251, 'learning_rate': 1.2596314681269173e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9674, 'grad_norm': 15.801322937011719, 'learning_rate': 1.2563203400691804e-06}[Rank 0] Trainer log: {'loss': 0.9674, 'grad_norm': 15.801322937011719, 'learning_rate': 1.2563203400691804e-06}[Rank 1] Trainer log: {'loss': 0.9674, 'grad_norm': 15.801322937011719, 'learning_rate': 1.2563203400691804e-06} [Rank 2] Trainer log: {'loss': 0.9674, 'grad_norm': 15.801322937011719, 'learning_rate': 1.2563203400691804e-06} {'loss': 0.9674, 'grad_norm': 15.801322937011719, 'learning_rate': 1.2563203400691804e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0435, 'grad_norm': 3.4900214672088623, 'learning_rate': 1.253013277919396e-06}[Rank 2] Trainer log: {'loss': 1.0435, 'grad_norm': 3.4900214672088623, 'learning_rate': 1.253013277919396e-06} [Rank 0] Trainer log: {'loss': 1.0435, 'grad_norm': 3.4900214672088623, 'learning_rate': 1.253013277919396e-06}[Rank 1] Trainer log: {'loss': 1.0435, 'grad_norm': 3.4900214672088623, 'learning_rate': 1.253013277919396e-06} {'loss': 1.0435, 'grad_norm': 3.4900214672088623, 'learning_rate': 1.253013277919396e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.607, 'grad_norm': 2.8236777782440186, 'learning_rate': 1.2497102832153785e-06}[Rank 0] Trainer log: {'loss': 0.607, 'grad_norm': 2.8236777782440186, 'learning_rate': 1.2497102832153785e-06} [Rank 1] Trainer log: {'loss': 0.607, 'grad_norm': 2.8236777782440186, 'learning_rate': 1.2497102832153785e-06} [Rank 2] Trainer log: {'loss': 0.607, 'grad_norm': 2.8236777782440186, 'learning_rate': 1.2497102832153785e-06} {'loss': 0.607, 'grad_norm': 2.8236777782440186, 'learning_rate': 1.2497102832153785e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 0.6479, 'grad_norm': 6.176882266998291, 'learning_rate': 1.2464113574930626e-06} [Rank 3] Trainer log: {'loss': 0.6479, 'grad_norm': 6.176882266998291, 'learning_rate': 1.2464113574930626e-06} [Rank 0] Trainer log: {'loss': 0.6479, 'grad_norm': 6.176882266998291, 'learning_rate': 1.2464113574930626e-06}[Rank 2] Trainer log: {'loss': 0.6479, 'grad_norm': 6.176882266998291, 'learning_rate': 1.2464113574930626e-06} {'loss': 0.6479, 'grad_norm': 6.176882266998291, 'learning_rate': 1.2464113574930626e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9894, 'grad_norm': 3.2460434436798096, 'learning_rate': 1.24311650228648e-06}[Rank 0] Trainer log: {'loss': 0.9894, 'grad_norm': 3.2460434436798096, 'learning_rate': 1.24311650228648e-06} [Rank 1] Trainer log: {'loss': 0.9894, 'grad_norm': 3.2460434436798096, 'learning_rate': 1.24311650228648e-06} [Rank 2] Trainer log: {'loss': 0.9894, 'grad_norm': 3.2460434436798096, 'learning_rate': 1.24311650228648e-06} {'loss': 0.9894, 'grad_norm': 3.2460434436798096, 'learning_rate': 1.24311650228648e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.7161, 'grad_norm': 15.67260456085205, 'learning_rate': 1.2398257191277796e-06}[Rank 3] Trainer log: {'loss': 0.7161, 'grad_norm': 15.67260456085205, 'learning_rate': 1.2398257191277796e-06} [Rank 2] Trainer log: {'loss': 0.7161, 'grad_norm': 15.67260456085205, 'learning_rate': 1.2398257191277796e-06} [Rank 1] Trainer log: {'loss': 0.7161, 'grad_norm': 15.67260456085205, 'learning_rate': 1.2398257191277796e-06} {'loss': 0.7161, 'grad_norm': 15.67260456085205, 'learning_rate': 1.2398257191277796e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9553, 'grad_norm': 7.140649795532227, 'learning_rate': 1.2365390095472086e-06} [Rank 1] Trainer log: {'loss': 0.9553, 'grad_norm': 7.140649795532227, 'learning_rate': 1.2365390095472086e-06}[Rank 0] Trainer log: {'loss': 0.9553, 'grad_norm': 7.140649795532227, 'learning_rate': 1.2365390095472086e-06} [Rank 2] Trainer log: {'loss': 0.9553, 'grad_norm': 7.140649795532227, 'learning_rate': 1.2365390095472086e-06} {'loss': 0.9553, 'grad_norm': 7.140649795532227, 'learning_rate': 1.2365390095472086e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.3914, 'grad_norm': 8.332245826721191, 'learning_rate': 1.2332563750731264e-06} [Rank 1] Trainer log: {'loss': 0.3914, 'grad_norm': 8.332245826721191, 'learning_rate': 1.2332563750731264e-06}[Rank 0] Trainer log: {'loss': 0.3914, 'grad_norm': 8.332245826721191, 'learning_rate': 1.2332563750731264e-06} [Rank 2] Trainer log: {'loss': 0.3914, 'grad_norm': 8.332245826721191, 'learning_rate': 1.2332563750731264e-06} {'loss': 0.3914, 'grad_norm': 8.332245826721191, 'learning_rate': 1.2332563750731264e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8399, 'grad_norm': 3.1761837005615234, 'learning_rate': 1.2299778172319943e-06} [Rank 1] Trainer log: {'loss': 0.8399, 'grad_norm': 3.1761837005615234, 'learning_rate': 1.2299778172319943e-06} [Rank 0] Trainer log: {'loss': 0.8399, 'grad_norm': 3.1761837005615234, 'learning_rate': 1.2299778172319943e-06} [Rank 2] Trainer log: {'loss': 0.8399, 'grad_norm': 3.1761837005615234, 'learning_rate': 1.2299778172319943e-06} {'loss': 0.8399, 'grad_norm': 3.1761837005615234, 'learning_rate': 1.2299778172319943e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 0.7534, 'grad_norm': 7.273342132568359, 'learning_rate': 1.2267033375483727e-06}[Rank 0] Trainer log: {'loss': 0.7534, 'grad_norm': 7.273342132568359, 'learning_rate': 1.2267033375483727e-06}[Rank 3] Trainer log: {'loss': 0.7534, 'grad_norm': 7.273342132568359, 'learning_rate': 1.2267033375483727e-06} [Rank 2] Trainer log: {'loss': 0.7534, 'grad_norm': 7.273342132568359, 'learning_rate': 1.2267033375483727e-06} {'loss': 0.7534, 'grad_norm': 7.273342132568359, 'learning_rate': 1.2267033375483727e-06, 'epoch': 0.85} [Rank 2] Trainer log: {'loss': 0.8425, 'grad_norm': 4.868124008178711, 'learning_rate': 1.2234329375449384e-06}[Rank 3] Trainer log: {'loss': 0.8425, 'grad_norm': 4.868124008178711, 'learning_rate': 1.2234329375449384e-06} [Rank 1] Trainer log: {'loss': 0.8425, 'grad_norm': 4.868124008178711, 'learning_rate': 1.2234329375449384e-06} [Rank 0] Trainer log: {'loss': 0.8425, 'grad_norm': 4.868124008178711, 'learning_rate': 1.2234329375449384e-06} {'loss': 0.8425, 'grad_norm': 4.868124008178711, 'learning_rate': 1.2234329375449384e-06, 'epoch': 0.85} [Rank 2] Trainer log: {'loss': 0.565, 'grad_norm': 7.409879207611084, 'learning_rate': 1.2201666187424566e-06} [Rank 3] Trainer log: {'loss': 0.565, 'grad_norm': 7.409879207611084, 'learning_rate': 1.2201666187424566e-06} [Rank 0] Trainer log: {'loss': 0.565, 'grad_norm': 7.409879207611084, 'learning_rate': 1.2201666187424566e-06}[Rank 1] Trainer log: {'loss': 0.565, 'grad_norm': 7.409879207611084, 'learning_rate': 1.2201666187424566e-06} {'loss': 0.565, 'grad_norm': 7.409879207611084, 'learning_rate': 1.2201666187424566e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0322, 'grad_norm': 5.147290229797363, 'learning_rate': 1.216904382659806e-06}[Rank 2] Trainer log: {'loss': 1.0322, 'grad_norm': 5.147290229797363, 'learning_rate': 1.216904382659806e-06}[Rank 0] Trainer log: {'loss': 1.0322, 'grad_norm': 5.147290229797363, 'learning_rate': 1.216904382659806e-06} [Rank 1] Trainer log: {'loss': 1.0322, 'grad_norm': 5.147290229797363, 'learning_rate': 1.216904382659806e-06} {'loss': 1.0322, 'grad_norm': 5.147290229797363, 'learning_rate': 1.216904382659806e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 1.1045, 'grad_norm': 4.461070537567139, 'learning_rate': 1.213646230813964e-06}[Rank 3] Trainer log: {'loss': 1.1045, 'grad_norm': 4.461070537567139, 'learning_rate': 1.213646230813964e-06} [Rank 2] Trainer log: {'loss': 1.1045, 'grad_norm': 4.461070537567139, 'learning_rate': 1.213646230813964e-06} [Rank 0] Trainer log: {'loss': 1.1045, 'grad_norm': 4.461070537567139, 'learning_rate': 1.213646230813964e-06} {'loss': 1.1045, 'grad_norm': 4.461070537567139, 'learning_rate': 1.213646230813964e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 0.863, 'grad_norm': 4.844371795654297, 'learning_rate': 1.2103921647200056e-06}[Rank 3] Trainer log: {'loss': 0.863, 'grad_norm': 4.844371795654297, 'learning_rate': 1.2103921647200056e-06}[Rank 2] Trainer log: {'loss': 0.863, 'grad_norm': 4.844371795654297, 'learning_rate': 1.2103921647200056e-06} [Rank 0] Trainer log: {'loss': 0.863, 'grad_norm': 4.844371795654297, 'learning_rate': 1.2103921647200056e-06} {'loss': 0.863, 'grad_norm': 4.844371795654297, 'learning_rate': 1.2103921647200056e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0519, 'grad_norm': 3.6758134365081787, 'learning_rate': 1.2071421858911037e-06}[Rank 0] Trainer log: {'loss': 1.0519, 'grad_norm': 3.6758134365081787, 'learning_rate': 1.2071421858911037e-06}[Rank 1] Trainer log: {'loss': 1.0519, 'grad_norm': 3.6758134365081787, 'learning_rate': 1.2071421858911037e-06} [Rank 2] Trainer log: {'loss': 1.0519, 'grad_norm': 3.6758134365081787, 'learning_rate': 1.2071421858911037e-06} {'loss': 1.0519, 'grad_norm': 3.6758134365081787, 'learning_rate': 1.2071421858911037e-06, 'epoch': 0.85} [Rank 2] Trainer log: {'loss': 0.6548, 'grad_norm': 21.68583106994629, 'learning_rate': 1.2038962958385415e-06}[Rank 3] Trainer log: {'loss': 0.6548, 'grad_norm': 21.68583106994629, 'learning_rate': 1.2038962958385415e-06} [Rank 1] Trainer log: {'loss': 0.6548, 'grad_norm': 21.68583106994629, 'learning_rate': 1.2038962958385415e-06} [Rank 0] Trainer log: {'loss': 0.6548, 'grad_norm': 21.68583106994629, 'learning_rate': 1.2038962958385415e-06} {'loss': 0.6548, 'grad_norm': 21.68583106994629, 'learning_rate': 1.2038962958385415e-06, 'epoch': 0.85} [Rank 2] Trainer log: {'loss': 0.9794, 'grad_norm': 4.261011600494385, 'learning_rate': 1.2006544960716894e-06} [Rank 0] Trainer log: {'loss': 0.9794, 'grad_norm': 4.261011600494385, 'learning_rate': 1.2006544960716894e-06}[Rank 3] Trainer log: {'loss': 0.9794, 'grad_norm': 4.261011600494385, 'learning_rate': 1.2006544960716894e-06} [Rank 1] Trainer log: {'loss': 0.9794, 'grad_norm': 4.261011600494385, 'learning_rate': 1.2006544960716894e-06} {'loss': 0.9794, 'grad_norm': 4.261011600494385, 'learning_rate': 1.2006544960716894e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8457, 'grad_norm': 5.930541038513184, 'learning_rate': 1.197416788098018e-06} [Rank 0] Trainer log: {'loss': 0.8457, 'grad_norm': 5.930541038513184, 'learning_rate': 1.197416788098018e-06}[Rank 1] Trainer log: {'loss': 0.8457, 'grad_norm': 5.930541038513184, 'learning_rate': 1.197416788098018e-06} [Rank 2] Trainer log: {'loss': 0.8457, 'grad_norm': 5.930541038513184, 'learning_rate': 1.197416788098018e-06} {'loss': 0.8457, 'grad_norm': 5.930541038513184, 'learning_rate': 1.197416788098018e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9886, 'grad_norm': 2.679431676864624, 'learning_rate': 1.1941831734231035e-06}[Rank 2] Trainer log: {'loss': 0.9886, 'grad_norm': 2.679431676864624, 'learning_rate': 1.1941831734231035e-06}[Rank 0] Trainer log: {'loss': 0.9886, 'grad_norm': 2.679431676864624, 'learning_rate': 1.1941831734231035e-06}[Rank 1] Trainer log: {'loss': 0.9886, 'grad_norm': 2.679431676864624, 'learning_rate': 1.1941831734231035e-06} {'loss': 0.9886, 'grad_norm': 2.679431676864624, 'learning_rate': 1.1941831734231035e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0463, 'grad_norm': 5.194766521453857, 'learning_rate': 1.1909536535506072e-06}[Rank 0] Trainer log: {'loss': 1.0463, 'grad_norm': 5.194766521453857, 'learning_rate': 1.1909536535506072e-06}[Rank 2] Trainer log: {'loss': 1.0463, 'grad_norm': 5.194766521453857, 'learning_rate': 1.1909536535506072e-06} [Rank 1] Trainer log: {'loss': 1.0463, 'grad_norm': 5.194766521453857, 'learning_rate': 1.1909536535506072e-06} {'loss': 1.0463, 'grad_norm': 5.194766521453857, 'learning_rate': 1.1909536535506072e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0117, 'grad_norm': 4.942829132080078, 'learning_rate': 1.1877282299822945e-06} [Rank 1] Trainer log: {'loss': 1.0117, 'grad_norm': 4.942829132080078, 'learning_rate': 1.1877282299822945e-06}[Rank 2] Trainer log: {'loss': 1.0117, 'grad_norm': 4.942829132080078, 'learning_rate': 1.1877282299822945e-06} [Rank 0] Trainer log: {'loss': 1.0117, 'grad_norm': 4.942829132080078, 'learning_rate': 1.1877282299822945e-06} {'loss': 1.0117, 'grad_norm': 4.942829132080078, 'learning_rate': 1.1877282299822945e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8003, 'grad_norm': 6.4068427085876465, 'learning_rate': 1.1845069042180247e-06} [Rank 0] Trainer log: {'loss': 0.8003, 'grad_norm': 6.4068427085876465, 'learning_rate': 1.1845069042180247e-06} [Rank 1] Trainer log: {'loss': 0.8003, 'grad_norm': 6.4068427085876465, 'learning_rate': 1.1845069042180247e-06} [Rank 2] Trainer log: {'loss': 0.8003, 'grad_norm': 6.4068427085876465, 'learning_rate': 1.1845069042180247e-06} {'loss': 0.8003, 'grad_norm': 6.4068427085876465, 'learning_rate': 1.1845069042180247e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 1.0261, 'grad_norm': 2.0635573863983154, 'learning_rate': 1.1812896777557469e-06}[Rank 3] Trainer log: {'loss': 1.0261, 'grad_norm': 2.0635573863983154, 'learning_rate': 1.1812896777557469e-06}[Rank 0] Trainer log: {'loss': 1.0261, 'grad_norm': 2.0635573863983154, 'learning_rate': 1.1812896777557469e-06} [Rank 2] Trainer log: {'loss': 1.0261, 'grad_norm': 2.0635573863983154, 'learning_rate': 1.1812896777557469e-06} {'loss': 1.0261, 'grad_norm': 2.0635573863983154, 'learning_rate': 1.1812896777557469e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 1.0065, 'grad_norm': 2.7957475185394287, 'learning_rate': 1.1780765520915082e-06} [Rank 3] Trainer log: {'loss': 1.0065, 'grad_norm': 2.7957475185394287, 'learning_rate': 1.1780765520915082e-06} [Rank 0] Trainer log: {'loss': 1.0065, 'grad_norm': 2.7957475185394287, 'learning_rate': 1.1780765520915082e-06}[Rank 2] Trainer log: {'loss': 1.0065, 'grad_norm': 2.7957475185394287, 'learning_rate': 1.1780765520915082e-06} {'loss': 1.0065, 'grad_norm': 2.7957475185394287, 'learning_rate': 1.1780765520915082e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.7752, 'grad_norm': 5.814560413360596, 'learning_rate': 1.1748675287194456e-06}[Rank 3] Trainer log: {'loss': 0.7752, 'grad_norm': 5.814560413360596, 'learning_rate': 1.1748675287194456e-06} [Rank 2] Trainer log: {'loss': 0.7752, 'grad_norm': 5.814560413360596, 'learning_rate': 1.1748675287194456e-06} [Rank 1] Trainer log: {'loss': 0.7752, 'grad_norm': 5.814560413360596, 'learning_rate': 1.1748675287194456e-06} {'loss': 0.7752, 'grad_norm': 5.814560413360596, 'learning_rate': 1.1748675287194456e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.6958, 'grad_norm': 7.740983009338379, 'learning_rate': 1.171662609131795e-06}[Rank 1] Trainer log: {'loss': 0.6958, 'grad_norm': 7.740983009338379, 'learning_rate': 1.171662609131795e-06} [Rank 3] Trainer log: {'loss': 0.6958, 'grad_norm': 7.740983009338379, 'learning_rate': 1.171662609131795e-06} [Rank 2] Trainer log: {'loss': 0.6958, 'grad_norm': 7.740983009338379, 'learning_rate': 1.171662609131795e-06} {'loss': 0.6958, 'grad_norm': 7.740983009338379, 'learning_rate': 1.171662609131795e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.7749, 'grad_norm': 9.390199661254883, 'learning_rate': 1.168461794818876e-06}[Rank 3] Trainer log: {'loss': 0.7749, 'grad_norm': 9.390199661254883, 'learning_rate': 1.168461794818876e-06}[Rank 1] Trainer log: {'loss': 0.7749, 'grad_norm': 9.390199661254883, 'learning_rate': 1.168461794818876e-06} [Rank 2] Trainer log: {'loss': 0.7749, 'grad_norm': 9.390199661254883, 'learning_rate': 1.168461794818876e-06} {'loss': 0.7749, 'grad_norm': 9.390199661254883, 'learning_rate': 1.168461794818876e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9675, 'grad_norm': 4.080273628234863, 'learning_rate': 1.165265087269102e-06}[Rank 1] Trainer log: {'loss': 0.9675, 'grad_norm': 4.080273628234863, 'learning_rate': 1.165265087269102e-06} [Rank 0] Trainer log: {'loss': 0.9675, 'grad_norm': 4.080273628234863, 'learning_rate': 1.165265087269102e-06}[Rank 2] Trainer log: {'loss': 0.9675, 'grad_norm': 4.080273628234863, 'learning_rate': 1.165265087269102e-06} {'loss': 0.9675, 'grad_norm': 4.080273628234863, 'learning_rate': 1.165265087269102e-06, 'epoch': 0.85} [Rank 0] Trainer log: {'loss': 0.6585, 'grad_norm': 6.530205726623535, 'learning_rate': 1.1620724879689793e-06}[Rank 3] Trainer log: {'loss': 0.6585, 'grad_norm': 6.530205726623535, 'learning_rate': 1.1620724879689793e-06}[Rank 1] Trainer log: {'loss': 0.6585, 'grad_norm': 6.530205726623535, 'learning_rate': 1.1620724879689793e-06} [Rank 2] Trainer log: {'loss': 0.6585, 'grad_norm': 6.530205726623535, 'learning_rate': 1.1620724879689793e-06} {'loss': 0.6585, 'grad_norm': 6.530205726623535, 'learning_rate': 1.1620724879689793e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.7065, 'grad_norm': 3.5721030235290527, 'learning_rate': 1.158883998403103e-06}[Rank 0] Trainer log: {'loss': 0.7065, 'grad_norm': 3.5721030235290527, 'learning_rate': 1.158883998403103e-06}[Rank 1] Trainer log: {'loss': 0.7065, 'grad_norm': 3.5721030235290527, 'learning_rate': 1.158883998403103e-06} [Rank 2] Trainer log: {'loss': 0.7065, 'grad_norm': 3.5721030235290527, 'learning_rate': 1.158883998403103e-06} {'loss': 0.7065, 'grad_norm': 3.5721030235290527, 'learning_rate': 1.158883998403103e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.6442, 'grad_norm': 7.477909088134766, 'learning_rate': 1.1556996200541526e-06}[Rank 1] Trainer log: {'loss': 0.6442, 'grad_norm': 7.477909088134766, 'learning_rate': 1.1556996200541526e-06} [Rank 2] Trainer log: {'loss': 0.6442, 'grad_norm': 7.477909088134766, 'learning_rate': 1.1556996200541526e-06} [Rank 0] Trainer log: {'loss': 0.6442, 'grad_norm': 7.477909088134766, 'learning_rate': 1.1556996200541526e-06} {'loss': 0.6442, 'grad_norm': 7.477909088134766, 'learning_rate': 1.1556996200541526e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.7548, 'grad_norm': 4.424305438995361, 'learning_rate': 1.1525193544029044e-06}[Rank 2] Trainer log: {'loss': 0.7548, 'grad_norm': 4.424305438995361, 'learning_rate': 1.1525193544029044e-06}[Rank 0] Trainer log: {'loss': 0.7548, 'grad_norm': 4.424305438995361, 'learning_rate': 1.1525193544029044e-06} [Rank 1] Trainer log: {'loss': 0.7548, 'grad_norm': 4.424305438995361, 'learning_rate': 1.1525193544029044e-06} {'loss': 0.7548, 'grad_norm': 4.424305438995361, 'learning_rate': 1.1525193544029044e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0897, 'grad_norm': 3.3281404972076416, 'learning_rate': 1.1493432029282149e-06}[Rank 1] Trainer log: {'loss': 1.0897, 'grad_norm': 3.3281404972076416, 'learning_rate': 1.1493432029282149e-06} [Rank 0] Trainer log: {'loss': 1.0897, 'grad_norm': 3.3281404972076416, 'learning_rate': 1.1493432029282149e-06}[Rank 2] Trainer log: {'loss': 1.0897, 'grad_norm': 3.3281404972076416, 'learning_rate': 1.1493432029282149e-06} {'loss': 1.0897, 'grad_norm': 3.3281404972076416, 'learning_rate': 1.1493432029282149e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8465, 'grad_norm': 9.381503105163574, 'learning_rate': 1.1461711671070252e-06}[Rank 1] Trainer log: {'loss': 0.8465, 'grad_norm': 9.381503105163574, 'learning_rate': 1.1461711671070252e-06} [Rank 2] Trainer log: {'loss': 0.8465, 'grad_norm': 9.381503105163574, 'learning_rate': 1.1461711671070252e-06} [Rank 0] Trainer log: {'loss': 0.8465, 'grad_norm': 9.381503105163574, 'learning_rate': 1.1461711671070252e-06} {'loss': 0.8465, 'grad_norm': 9.381503105163574, 'learning_rate': 1.1461711671070252e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.5399, 'grad_norm': 6.091038703918457, 'learning_rate': 1.143003248414375e-06}[Rank 0] Trainer log: {'loss': 0.5399, 'grad_norm': 6.091038703918457, 'learning_rate': 1.143003248414375e-06}[Rank 2] Trainer log: {'loss': 0.5399, 'grad_norm': 6.091038703918457, 'learning_rate': 1.143003248414375e-06} [Rank 1] Trainer log: {'loss': 0.5399, 'grad_norm': 6.091038703918457, 'learning_rate': 1.143003248414375e-06} {'loss': 0.5399, 'grad_norm': 6.091038703918457, 'learning_rate': 1.143003248414375e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 1.0603, 'grad_norm': 2.2537269592285156, 'learning_rate': 1.1398394483233765e-06}[Rank 0] Trainer log: {'loss': 1.0603, 'grad_norm': 2.2537269592285156, 'learning_rate': 1.1398394483233765e-06} [Rank 1] Trainer log: {'loss': 1.0603, 'grad_norm': 2.2537269592285156, 'learning_rate': 1.1398394483233765e-06}[Rank 2] Trainer log: {'loss': 1.0603, 'grad_norm': 2.2537269592285156, 'learning_rate': 1.1398394483233765e-06} {'loss': 1.0603, 'grad_norm': 2.2537269592285156, 'learning_rate': 1.1398394483233765e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 0.7426, 'grad_norm': 3.4085538387298584, 'learning_rate': 1.136679768305231e-06}[Rank 3] Trainer log: {'loss': 0.7426, 'grad_norm': 3.4085538387298584, 'learning_rate': 1.136679768305231e-06} [Rank 0] Trainer log: {'loss': 0.7426, 'grad_norm': 3.4085538387298584, 'learning_rate': 1.136679768305231e-06} [Rank 2] Trainer log: {'loss': 0.7426, 'grad_norm': 3.4085538387298584, 'learning_rate': 1.136679768305231e-06} {'loss': 0.7426, 'grad_norm': 3.4085538387298584, 'learning_rate': 1.136679768305231e-06, 'epoch': 0.85} [Rank 2] Trainer log: {'loss': 0.7925, 'grad_norm': 2.2854721546173096, 'learning_rate': 1.1335242098292265e-06}[Rank 3] Trainer log: {'loss': 0.7925, 'grad_norm': 2.2854721546173096, 'learning_rate': 1.1335242098292265e-06} [Rank 1] Trainer log: {'loss': 0.7925, 'grad_norm': 2.2854721546173096, 'learning_rate': 1.1335242098292265e-06} [Rank 0] Trainer log: {'loss': 0.7925, 'grad_norm': 2.2854721546173096, 'learning_rate': 1.1335242098292265e-06} {'loss': 0.7925, 'grad_norm': 2.2854721546173096, 'learning_rate': 1.1335242098292265e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.8086, 'grad_norm': 3.5862090587615967, 'learning_rate': 1.1303727743627356e-06}[Rank 1] Trainer log: {'loss': 0.8086, 'grad_norm': 3.5862090587615967, 'learning_rate': 1.1303727743627356e-06} [Rank 2] Trainer log: {'loss': 0.8086, 'grad_norm': 3.5862090587615967, 'learning_rate': 1.1303727743627356e-06} [Rank 0] Trainer log: {'loss': 0.8086, 'grad_norm': 3.5862090587615967, 'learning_rate': 1.1303727743627356e-06} {'loss': 0.8086, 'grad_norm': 3.5862090587615967, 'learning_rate': 1.1303727743627356e-06, 'epoch': 0.85} [Rank 1] Trainer log: {'loss': 0.7915, 'grad_norm': 8.363637924194336, 'learning_rate': 1.1272254633712053e-06}[Rank 2] Trainer log: {'loss': 0.7915, 'grad_norm': 8.363637924194336, 'learning_rate': 1.1272254633712053e-06} [Rank 3] Trainer log: {'loss': 0.7915, 'grad_norm': 8.363637924194336, 'learning_rate': 1.1272254633712053e-06} [Rank 0] Trainer log: {'loss': 0.7915, 'grad_norm': 8.363637924194336, 'learning_rate': 1.1272254633712053e-06} {'loss': 0.7915, 'grad_norm': 8.363637924194336, 'learning_rate': 1.1272254633712053e-06, 'epoch': 0.85} [Rank 3] Trainer log: {'loss': 0.9, 'grad_norm': 3.7336585521698, 'learning_rate': 1.124082278318176e-06}[Rank 0] Trainer log: {'loss': 0.9, 'grad_norm': 3.7336585521698, 'learning_rate': 1.124082278318176e-06}[Rank 1] Trainer log: {'loss': 0.9, 'grad_norm': 3.7336585521698, 'learning_rate': 1.124082278318176e-06} [Rank 2] Trainer log: {'loss': 0.9, 'grad_norm': 3.7336585521698, 'learning_rate': 1.124082278318176e-06} {'loss': 0.9, 'grad_norm': 3.7336585521698, 'learning_rate': 1.124082278318176e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.7083, 'grad_norm': 3.0705337524414062, 'learning_rate': 1.1209432206652593e-06} [Rank 1] Trainer log: {'loss': 0.7083, 'grad_norm': 3.0705337524414062, 'learning_rate': 1.1209432206652593e-06}[Rank 0] Trainer log: {'loss': 0.7083, 'grad_norm': 3.0705337524414062, 'learning_rate': 1.1209432206652593e-06} [Rank 2] Trainer log: {'loss': 0.7083, 'grad_norm': 3.0705337524414062, 'learning_rate': 1.1209432206652593e-06} {'loss': 0.7083, 'grad_norm': 3.0705337524414062, 'learning_rate': 1.1209432206652593e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8317, 'grad_norm': 6.248520851135254, 'learning_rate': 1.117808291872151e-06} [Rank 2] Trainer log: {'loss': 0.8317, 'grad_norm': 6.248520851135254, 'learning_rate': 1.117808291872151e-06} [Rank 1] Trainer log: {'loss': 0.8317, 'grad_norm': 6.248520851135254, 'learning_rate': 1.117808291872151e-06}[Rank 0] Trainer log: {'loss': 0.8317, 'grad_norm': 6.248520851135254, 'learning_rate': 1.117808291872151e-06} {'loss': 0.8317, 'grad_norm': 6.248520851135254, 'learning_rate': 1.117808291872151e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.8684, 'grad_norm': 4.162790775299072, 'learning_rate': 1.114677493396632e-06}[Rank 1] Trainer log: {'loss': 0.8684, 'grad_norm': 4.162790775299072, 'learning_rate': 1.114677493396632e-06}[Rank 3] Trainer log: {'loss': 0.8684, 'grad_norm': 4.162790775299072, 'learning_rate': 1.114677493396632e-06} [Rank 2] Trainer log: {'loss': 0.8684, 'grad_norm': 4.162790775299072, 'learning_rate': 1.114677493396632e-06} {'loss': 0.8684, 'grad_norm': 4.162790775299072, 'learning_rate': 1.114677493396632e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8011, 'grad_norm': 6.68202018737793, 'learning_rate': 1.1115508266945552e-06}[Rank 1] Trainer log: {'loss': 0.8011, 'grad_norm': 6.68202018737793, 'learning_rate': 1.1115508266945552e-06} [Rank 2] Trainer log: {'loss': 0.8011, 'grad_norm': 6.68202018737793, 'learning_rate': 1.1115508266945552e-06} [Rank 0] Trainer log: {'loss': 0.8011, 'grad_norm': 6.68202018737793, 'learning_rate': 1.1115508266945552e-06} {'loss': 0.8011, 'grad_norm': 6.68202018737793, 'learning_rate': 1.1115508266945552e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.9107, 'grad_norm': 3.2891006469726562, 'learning_rate': 1.1084282932198543e-06}[Rank 2] Trainer log: {'loss': 0.9107, 'grad_norm': 3.2891006469726562, 'learning_rate': 1.1084282932198543e-06}[Rank 1] Trainer log: {'loss': 0.9107, 'grad_norm': 3.2891006469726562, 'learning_rate': 1.1084282932198543e-06} [Rank 3] Trainer log: {'loss': 0.9107, 'grad_norm': 3.2891006469726562, 'learning_rate': 1.1084282932198543e-06} {'loss': 0.9107, 'grad_norm': 3.2891006469726562, 'learning_rate': 1.1084282932198543e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.8201, 'grad_norm': 4.499911785125732, 'learning_rate': 1.1053098944245432e-06}[Rank 0] Trainer log: {'loss': 0.8201, 'grad_norm': 4.499911785125732, 'learning_rate': 1.1053098944245432e-06}[Rank 3] Trainer log: {'loss': 0.8201, 'grad_norm': 4.499911785125732, 'learning_rate': 1.1053098944245432e-06} [Rank 2] Trainer log: {'loss': 0.8201, 'grad_norm': 4.499911785125732, 'learning_rate': 1.1053098944245432e-06} {'loss': 0.8201, 'grad_norm': 4.499911785125732, 'learning_rate': 1.1053098944245432e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.7358, 'grad_norm': 3.4209096431732178, 'learning_rate': 1.1021956317587157e-06}[Rank 0] Trainer log: {'loss': 0.7358, 'grad_norm': 3.4209096431732178, 'learning_rate': 1.1021956317587157e-06}[Rank 3] Trainer log: {'loss': 0.7358, 'grad_norm': 3.4209096431732178, 'learning_rate': 1.1021956317587157e-06} [Rank 2] Trainer log: {'loss': 0.7358, 'grad_norm': 3.4209096431732178, 'learning_rate': 1.1021956317587157e-06} {'loss': 0.7358, 'grad_norm': 3.4209096431732178, 'learning_rate': 1.1021956317587157e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.9605, 'grad_norm': 5.311986923217773, 'learning_rate': 1.099085506670532e-06}[Rank 1] Trainer log: {'loss': 0.9605, 'grad_norm': 5.311986923217773, 'learning_rate': 1.099085506670532e-06} [Rank 0] Trainer log: {'loss': 0.9605, 'grad_norm': 5.311986923217773, 'learning_rate': 1.099085506670532e-06} [Rank 2] Trainer log: {'loss': 0.9605, 'grad_norm': 5.311986923217773, 'learning_rate': 1.099085506670532e-06} {'loss': 0.9605, 'grad_norm': 5.311986923217773, 'learning_rate': 1.099085506670532e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8623, 'grad_norm': 2.0517547130584717, 'learning_rate': 1.0959795206062395e-06}[Rank 1] Trainer log: {'loss': 0.8623, 'grad_norm': 2.0517547130584717, 'learning_rate': 1.0959795206062395e-06} [Rank 2] Trainer log: {'loss': 0.8623, 'grad_norm': 2.0517547130584717, 'learning_rate': 1.0959795206062395e-06}[Rank 0] Trainer log: {'loss': 0.8623, 'grad_norm': 2.0517547130584717, 'learning_rate': 1.0959795206062395e-06} {'loss': 0.8623, 'grad_norm': 2.0517547130584717, 'learning_rate': 1.0959795206062395e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.7888, 'grad_norm': 7.559993743896484, 'learning_rate': 1.0928776750101555e-06}[Rank 2] Trainer log: {'loss': 0.7888, 'grad_norm': 7.559993743896484, 'learning_rate': 1.0928776750101555e-06} [Rank 1] Trainer log: {'loss': 0.7888, 'grad_norm': 7.559993743896484, 'learning_rate': 1.0928776750101555e-06} [Rank 0] Trainer log: {'loss': 0.7888, 'grad_norm': 7.559993743896484, 'learning_rate': 1.0928776750101555e-06} {'loss': 0.7888, 'grad_norm': 7.559993743896484, 'learning_rate': 1.0928776750101555e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.8357, 'grad_norm': 3.790278911590576, 'learning_rate': 1.0897799713246705e-06}[Rank 3] Trainer log: {'loss': 0.8357, 'grad_norm': 3.790278911590576, 'learning_rate': 1.0897799713246705e-06}[Rank 2] Trainer log: {'loss': 0.8357, 'grad_norm': 3.790278911590576, 'learning_rate': 1.0897799713246705e-06} [Rank 1] Trainer log: {'loss': 0.8357, 'grad_norm': 3.790278911590576, 'learning_rate': 1.0897799713246705e-06} {'loss': 0.8357, 'grad_norm': 3.790278911590576, 'learning_rate': 1.0897799713246705e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.6918, 'grad_norm': 3.402791738510132, 'learning_rate': 1.086686410990251e-06}[Rank 1] Trainer log: {'loss': 0.6918, 'grad_norm': 3.402791738510132, 'learning_rate': 1.086686410990251e-06} [Rank 2] Trainer log: {'loss': 0.6918, 'grad_norm': 3.402791738510132, 'learning_rate': 1.086686410990251e-06}[Rank 3] Trainer log: {'loss': 0.6918, 'grad_norm': 3.402791738510132, 'learning_rate': 1.086686410990251e-06} {'loss': 0.6918, 'grad_norm': 3.402791738510132, 'learning_rate': 1.086686410990251e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8131, 'grad_norm': 4.844381332397461, 'learning_rate': 1.0835969954454395e-06}[Rank 0] Trainer log: {'loss': 0.8131, 'grad_norm': 4.844381332397461, 'learning_rate': 1.0835969954454395e-06} [Rank 2] Trainer log: {'loss': 0.8131, 'grad_norm': 4.844381332397461, 'learning_rate': 1.0835969954454395e-06} [Rank 1] Trainer log: {'loss': 0.8131, 'grad_norm': 4.844381332397461, 'learning_rate': 1.0835969954454395e-06} {'loss': 0.8131, 'grad_norm': 4.844381332397461, 'learning_rate': 1.0835969954454395e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.9896, 'grad_norm': 2.367107629776001, 'learning_rate': 1.080511726126845e-06}[Rank 1] Trainer log: {'loss': 0.9896, 'grad_norm': 2.367107629776001, 'learning_rate': 1.080511726126845e-06} [Rank 0] Trainer log: {'loss': 0.9896, 'grad_norm': 2.367107629776001, 'learning_rate': 1.080511726126845e-06} [Rank 2] Trainer log: {'loss': 0.9896, 'grad_norm': 2.367107629776001, 'learning_rate': 1.080511726126845e-06} {'loss': 0.9896, 'grad_norm': 2.367107629776001, 'learning_rate': 1.080511726126845e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8529, 'grad_norm': 3.701777458190918, 'learning_rate': 1.077430604469153e-06} [Rank 1] Trainer log: {'loss': 0.8529, 'grad_norm': 3.701777458190918, 'learning_rate': 1.077430604469153e-06} [Rank 0] Trainer log: {'loss': 0.8529, 'grad_norm': 3.701777458190918, 'learning_rate': 1.077430604469153e-06} [Rank 2] Trainer log: {'loss': 0.8529, 'grad_norm': 3.701777458190918, 'learning_rate': 1.077430604469153e-06} {'loss': 0.8529, 'grad_norm': 3.701777458190918, 'learning_rate': 1.077430604469153e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 1.0101, 'grad_norm': 4.527693748474121, 'learning_rate': 1.074353631905123e-06}[Rank 0] Trainer log: {'loss': 1.0101, 'grad_norm': 4.527693748474121, 'learning_rate': 1.074353631905123e-06}[Rank 1] Trainer log: {'loss': 1.0101, 'grad_norm': 4.527693748474121, 'learning_rate': 1.074353631905123e-06} [Rank 2] Trainer log: {'loss': 1.0101, 'grad_norm': 4.527693748474121, 'learning_rate': 1.074353631905123e-06} {'loss': 1.0101, 'grad_norm': 4.527693748474121, 'learning_rate': 1.074353631905123e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.7367, 'grad_norm': 1.5914065837860107, 'learning_rate': 1.071280809865578e-06}[Rank 3] Trainer log: {'loss': 0.7367, 'grad_norm': 1.5914065837860107, 'learning_rate': 1.071280809865578e-06}[Rank 2] Trainer log: {'loss': 0.7367, 'grad_norm': 1.5914065837860107, 'learning_rate': 1.071280809865578e-06} [Rank 1] Trainer log: {'loss': 0.7367, 'grad_norm': 1.5914065837860107, 'learning_rate': 1.071280809865578e-06} {'loss': 0.7367, 'grad_norm': 1.5914065837860107, 'learning_rate': 1.071280809865578e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.7175, 'grad_norm': 8.545585632324219, 'learning_rate': 1.0682121397794131e-06} [Rank 1] Trainer log: {'loss': 0.7175, 'grad_norm': 8.545585632324219, 'learning_rate': 1.0682121397794131e-06} [Rank 2] Trainer log: {'loss': 0.7175, 'grad_norm': 8.545585632324219, 'learning_rate': 1.0682121397794131e-06} [Rank 0] Trainer log: {'loss': 0.7175, 'grad_norm': 8.545585632324219, 'learning_rate': 1.0682121397794131e-06} {'loss': 0.7175, 'grad_norm': 8.545585632324219, 'learning_rate': 1.0682121397794131e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.9305, 'grad_norm': 6.259365081787109, 'learning_rate': 1.0651476230735979e-06}[Rank 3] Trainer log: {'loss': 0.9305, 'grad_norm': 6.259365081787109, 'learning_rate': 1.0651476230735979e-06}[Rank 0] Trainer log: {'loss': 0.9305, 'grad_norm': 6.259365081787109, 'learning_rate': 1.0651476230735979e-06} [Rank 2] Trainer log: {'loss': 0.9305, 'grad_norm': 6.259365081787109, 'learning_rate': 1.0651476230735979e-06} {'loss': 0.9305, 'grad_norm': 6.259365081787109, 'learning_rate': 1.0651476230735979e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.7601, 'grad_norm': 18.78659439086914, 'learning_rate': 1.0620872611731636e-06}[Rank 3] Trainer log: {'loss': 0.7601, 'grad_norm': 18.78659439086914, 'learning_rate': 1.0620872611731636e-06} [Rank 0] Trainer log: {'loss': 0.7601, 'grad_norm': 18.78659439086914, 'learning_rate': 1.0620872611731636e-06} [Rank 2] Trainer log: {'loss': 0.7601, 'grad_norm': 18.78659439086914, 'learning_rate': 1.0620872611731636e-06} {'loss': 0.7601, 'grad_norm': 18.78659439086914, 'learning_rate': 1.0620872611731636e-06, 'epoch': 0.86} [Rank 2] Trainer log: {'loss': 1.0566, 'grad_norm': 2.383410692214966, 'learning_rate': 1.0590310555012106e-06}[Rank 3] Trainer log: {'loss': 1.0566, 'grad_norm': 2.383410692214966, 'learning_rate': 1.0590310555012106e-06} [Rank 0] Trainer log: {'loss': 1.0566, 'grad_norm': 2.383410692214966, 'learning_rate': 1.0590310555012106e-06} [Rank 1] Trainer log: {'loss': 1.0566, 'grad_norm': 2.383410692214966, 'learning_rate': 1.0590310555012106e-06} {'loss': 1.0566, 'grad_norm': 2.383410692214966, 'learning_rate': 1.0590310555012106e-06, 'epoch': 0.86} [Rank 2] Trainer log: {'loss': 1.0008, 'grad_norm': 2.8948845863342285, 'learning_rate': 1.0559790074789134e-06} [Rank 1] Trainer log: {'loss': 1.0008, 'grad_norm': 2.8948845863342285, 'learning_rate': 1.0559790074789134e-06} [Rank 0] Trainer log: {'loss': 1.0008, 'grad_norm': 2.8948845863342285, 'learning_rate': 1.0559790074789134e-06}[Rank 3] Trainer log: {'loss': 1.0008, 'grad_norm': 2.8948845863342285, 'learning_rate': 1.0559790074789134e-06} {'loss': 1.0008, 'grad_norm': 2.8948845863342285, 'learning_rate': 1.0559790074789134e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.6125, 'grad_norm': 2.1941213607788086, 'learning_rate': 1.0529311185255032e-06}[Rank 3] Trainer log: {'loss': 0.6125, 'grad_norm': 2.1941213607788086, 'learning_rate': 1.0529311185255032e-06}[Rank 0] Trainer log: {'loss': 0.6125, 'grad_norm': 2.1941213607788086, 'learning_rate': 1.0529311185255032e-06} [Rank 2] Trainer log: {'loss': 0.6125, 'grad_norm': 2.1941213607788086, 'learning_rate': 1.0529311185255032e-06} {'loss': 0.6125, 'grad_norm': 2.1941213607788086, 'learning_rate': 1.0529311185255032e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.8833, 'grad_norm': 2.1396842002868652, 'learning_rate': 1.049887390058285e-06} [Rank 3] Trainer log: {'loss': 0.8833, 'grad_norm': 2.1396842002868652, 'learning_rate': 1.049887390058285e-06} [Rank 2] Trainer log: {'loss': 0.8833, 'grad_norm': 2.1396842002868652, 'learning_rate': 1.049887390058285e-06} [Rank 0] Trainer log: {'loss': 0.8833, 'grad_norm': 2.1396842002868652, 'learning_rate': 1.049887390058285e-06} {'loss': 0.8833, 'grad_norm': 2.1396842002868652, 'learning_rate': 1.049887390058285e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.961, 'grad_norm': 2.83878231048584, 'learning_rate': 1.0468478234926272e-06}[Rank 1] Trainer log: {'loss': 0.961, 'grad_norm': 2.83878231048584, 'learning_rate': 1.0468478234926272e-06}[Rank 2] Trainer log: {'loss': 0.961, 'grad_norm': 2.83878231048584, 'learning_rate': 1.0468478234926272e-06} [Rank 3] Trainer log: {'loss': 0.961, 'grad_norm': 2.83878231048584, 'learning_rate': 1.0468478234926272e-06} {'loss': 0.961, 'grad_norm': 2.83878231048584, 'learning_rate': 1.0468478234926272e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.6219, 'grad_norm': 2.5849356651306152, 'learning_rate': 1.043812420241961e-06}[Rank 3] Trainer log: {'loss': 0.6219, 'grad_norm': 2.5849356651306152, 'learning_rate': 1.043812420241961e-06}[Rank 1] Trainer log: {'loss': 0.6219, 'grad_norm': 2.5849356651306152, 'learning_rate': 1.043812420241961e-06} [Rank 2] Trainer log: {'loss': 0.6219, 'grad_norm': 2.5849356651306152, 'learning_rate': 1.043812420241961e-06} {'loss': 0.6219, 'grad_norm': 2.5849356651306152, 'learning_rate': 1.043812420241961e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.8951, 'grad_norm': 4.619565010070801, 'learning_rate': 1.0407811817177805e-06}[Rank 1] Trainer log: {'loss': 0.8951, 'grad_norm': 4.619565010070801, 'learning_rate': 1.0407811817177805e-06} [Rank 3] Trainer log: {'loss': 0.8951, 'grad_norm': 4.619565010070801, 'learning_rate': 1.0407811817177805e-06} [Rank 2] Trainer log: {'loss': 0.8951, 'grad_norm': 4.619565010070801, 'learning_rate': 1.0407811817177805e-06} {'loss': 0.8951, 'grad_norm': 4.619565010070801, 'learning_rate': 1.0407811817177805e-06, 'epoch': 0.86} [Rank 2] Trainer log: {'loss': 0.7433, 'grad_norm': 4.64235782623291, 'learning_rate': 1.0377541093296484e-06}[Rank 3] Trainer log: {'loss': 0.7433, 'grad_norm': 4.64235782623291, 'learning_rate': 1.0377541093296484e-06} [Rank 0] Trainer log: {'loss': 0.7433, 'grad_norm': 4.64235782623291, 'learning_rate': 1.0377541093296484e-06}[Rank 1] Trainer log: {'loss': 0.7433, 'grad_norm': 4.64235782623291, 'learning_rate': 1.0377541093296484e-06} {'loss': 0.7433, 'grad_norm': 4.64235782623291, 'learning_rate': 1.0377541093296484e-06, 'epoch': 0.86} [Rank 2] Trainer log: {'loss': 0.886, 'grad_norm': 8.731816291809082, 'learning_rate': 1.0347312044851855e-06}[Rank 0] Trainer log: {'loss': 0.886, 'grad_norm': 8.731816291809082, 'learning_rate': 1.0347312044851855e-06} [Rank 3] Trainer log: {'loss': 0.886, 'grad_norm': 8.731816291809082, 'learning_rate': 1.0347312044851855e-06} [Rank 1] Trainer log: {'loss': 0.886, 'grad_norm': 8.731816291809082, 'learning_rate': 1.0347312044851855e-06} {'loss': 0.886, 'grad_norm': 8.731816291809082, 'learning_rate': 1.0347312044851855e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.5758, 'grad_norm': 1.6560418605804443, 'learning_rate': 1.0317124685900738e-06} [Rank 0] Trainer log: {'loss': 0.5758, 'grad_norm': 1.6560418605804443, 'learning_rate': 1.0317124685900738e-06} [Rank 2] Trainer log: {'loss': 0.5758, 'grad_norm': 1.6560418605804443, 'learning_rate': 1.0317124685900738e-06} [Rank 1] Trainer log: {'loss': 0.5758, 'grad_norm': 1.6560418605804443, 'learning_rate': 1.0317124685900738e-06} {'loss': 0.5758, 'grad_norm': 1.6560418605804443, 'learning_rate': 1.0317124685900738e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.7695, 'grad_norm': 3.6948626041412354, 'learning_rate': 1.0286979030480627e-06} [Rank 1] Trainer log: {'loss': 0.7695, 'grad_norm': 3.6948626041412354, 'learning_rate': 1.0286979030480627e-06}[Rank 0] Trainer log: {'loss': 0.7695, 'grad_norm': 3.6948626041412354, 'learning_rate': 1.0286979030480627e-06} [Rank 2] Trainer log: {'loss': 0.7695, 'grad_norm': 3.6948626041412354, 'learning_rate': 1.0286979030480627e-06} {'loss': 0.7695, 'grad_norm': 3.6948626041412354, 'learning_rate': 1.0286979030480627e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.9661, 'grad_norm': 6.183495998382568, 'learning_rate': 1.0256875092609553e-06}[Rank 2] Trainer log: {'loss': 0.9661, 'grad_norm': 6.183495998382568, 'learning_rate': 1.0256875092609553e-06}[Rank 1] Trainer log: {'loss': 0.9661, 'grad_norm': 6.183495998382568, 'learning_rate': 1.0256875092609553e-06} [Rank 0] Trainer log: {'loss': 0.9661, 'grad_norm': 6.183495998382568, 'learning_rate': 1.0256875092609553e-06} {'loss': 0.9661, 'grad_norm': 6.183495998382568, 'learning_rate': 1.0256875092609553e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.9392, 'grad_norm': 1.8786330223083496, 'learning_rate': 1.0226812886286196e-06} [Rank 2] Trainer log: {'loss': 0.9392, 'grad_norm': 1.8786330223083496, 'learning_rate': 1.0226812886286196e-06} [Rank 0] Trainer log: {'loss': 0.9392, 'grad_norm': 1.8786330223083496, 'learning_rate': 1.0226812886286196e-06}[Rank 3] Trainer log: {'loss': 0.9392, 'grad_norm': 1.8786330223083496, 'learning_rate': 1.0226812886286196e-06} {'loss': 0.9392, 'grad_norm': 1.8786330223083496, 'learning_rate': 1.0226812886286196e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.9097, 'grad_norm': 4.503698825836182, 'learning_rate': 1.0196792425489833e-06}[Rank 3] Trainer log: {'loss': 0.9097, 'grad_norm': 4.503698825836182, 'learning_rate': 1.0196792425489833e-06} [Rank 1] Trainer log: {'loss': 0.9097, 'grad_norm': 4.503698825836182, 'learning_rate': 1.0196792425489833e-06} [Rank 2] Trainer log: {'loss': 0.9097, 'grad_norm': 4.503698825836182, 'learning_rate': 1.0196792425489833e-06} {'loss': 0.9097, 'grad_norm': 4.503698825836182, 'learning_rate': 1.0196792425489833e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.9532, 'grad_norm': 5.394480228424072, 'learning_rate': 1.016681372418029e-06}[Rank 0] Trainer log: {'loss': 0.9532, 'grad_norm': 5.394480228424072, 'learning_rate': 1.016681372418029e-06}[Rank 3] Trainer log: {'loss': 0.9532, 'grad_norm': 5.394480228424072, 'learning_rate': 1.016681372418029e-06} [Rank 2] Trainer log: {'loss': 0.9532, 'grad_norm': 5.394480228424072, 'learning_rate': 1.016681372418029e-06} {'loss': 0.9532, 'grad_norm': 5.394480228424072, 'learning_rate': 1.016681372418029e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.5469, 'grad_norm': 5.536595821380615, 'learning_rate': 1.0136876796298012e-06}[Rank 3] Trainer log: {'loss': 0.5469, 'grad_norm': 5.536595821380615, 'learning_rate': 1.0136876796298012e-06}[Rank 2] Trainer log: {'loss': 0.5469, 'grad_norm': 5.536595821380615, 'learning_rate': 1.0136876796298012e-06} [Rank 1] Trainer log: {'loss': 0.5469, 'grad_norm': 5.536595821380615, 'learning_rate': 1.0136876796298012e-06} {'loss': 0.5469, 'grad_norm': 5.536595821380615, 'learning_rate': 1.0136876796298012e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 0.7744, 'grad_norm': 10.332438468933105, 'learning_rate': 1.0106981655763959e-06}[Rank 3] Trainer log: {'loss': 0.7744, 'grad_norm': 10.332438468933105, 'learning_rate': 1.0106981655763959e-06}[Rank 1] Trainer log: {'loss': 0.7744, 'grad_norm': 10.332438468933105, 'learning_rate': 1.0106981655763959e-06} [Rank 2] Trainer log: {'loss': 0.7744, 'grad_norm': 10.332438468933105, 'learning_rate': 1.0106981655763959e-06} {'loss': 0.7744, 'grad_norm': 10.332438468933105, 'learning_rate': 1.0106981655763959e-06, 'epoch': 0.86} [Rank 2] Trainer log: {'loss': 0.9016, 'grad_norm': 3.3828930854797363, 'learning_rate': 1.0077128316479768e-06} [Rank 3] Trainer log: {'loss': 0.9016, 'grad_norm': 3.3828930854797363, 'learning_rate': 1.0077128316479768e-06} [Rank 1] Trainer log: {'loss': 0.9016, 'grad_norm': 3.3828930854797363, 'learning_rate': 1.0077128316479768e-06} [Rank 0] Trainer log: {'loss': 0.9016, 'grad_norm': 3.3828930854797363, 'learning_rate': 1.0077128316479768e-06} {'loss': 0.9016, 'grad_norm': 3.3828930854797363, 'learning_rate': 1.0077128316479768e-06, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 1.0566, 'grad_norm': 2.470381736755371, 'learning_rate': 1.00473167923275e-06} [Rank 2] Trainer log: {'loss': 1.0566, 'grad_norm': 2.470381736755371, 'learning_rate': 1.00473167923275e-06}[Rank 0] Trainer log: {'loss': 1.0566, 'grad_norm': 2.470381736755371, 'learning_rate': 1.00473167923275e-06} [Rank 3] Trainer log: {'loss': 1.0566, 'grad_norm': 2.470381736755371, 'learning_rate': 1.00473167923275e-06} {'loss': 1.0566, 'grad_norm': 2.470381736755371, 'learning_rate': 1.00473167923275e-06, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.6365, 'grad_norm': 6.542340278625488, 'learning_rate': 1.0017547097169888e-06}[Rank 0] Trainer log: {'loss': 0.6365, 'grad_norm': 6.542340278625488, 'learning_rate': 1.0017547097169888e-06} [Rank 1] Trainer log: {'loss': 0.6365, 'grad_norm': 6.542340278625488, 'learning_rate': 1.0017547097169888e-06} [Rank 2] Trainer log: {'loss': 0.6365, 'grad_norm': 6.542340278625488, 'learning_rate': 1.0017547097169888e-06} {'loss': 0.6365, 'grad_norm': 6.542340278625488, 'learning_rate': 1.0017547097169888e-06, 'epoch': 0.86} [Rank 0] Trainer log: {'loss': 1.0421, 'grad_norm': 7.6987104415893555, 'learning_rate': 9.987819244850195e-07}[Rank 3] Trainer log: {'loss': 1.0421, 'grad_norm': 7.6987104415893555, 'learning_rate': 9.987819244850195e-07}[Rank 1] Trainer log: {'loss': 1.0421, 'grad_norm': 7.6987104415893555, 'learning_rate': 9.987819244850195e-07} [Rank 2] Trainer log: {'loss': 1.0421, 'grad_norm': 7.6987104415893555, 'learning_rate': 9.987819244850195e-07} {'loss': 1.0421, 'grad_norm': 7.6987104415893555, 'learning_rate': 9.987819244850195e-07, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.9262, 'grad_norm': 3.920886278152466, 'learning_rate': 9.95813324919216e-07} [Rank 2] Trainer log: {'loss': 0.9262, 'grad_norm': 3.920886278152466, 'learning_rate': 9.95813324919216e-07}[Rank 0] Trainer log: {'loss': 0.9262, 'grad_norm': 3.920886278152466, 'learning_rate': 9.95813324919216e-07} [Rank 1] Trainer log: {'loss': 0.9262, 'grad_norm': 3.920886278152466, 'learning_rate': 9.95813324919216e-07} {'loss': 0.9262, 'grad_norm': 3.920886278152466, 'learning_rate': 9.95813324919216e-07, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8906, 'grad_norm': 4.587650299072266, 'learning_rate': 9.928489124000096e-07}[Rank 0] Trainer log: {'loss': 0.8906, 'grad_norm': 4.587650299072266, 'learning_rate': 9.928489124000096e-07} [Rank 1] Trainer log: {'loss': 0.8906, 'grad_norm': 4.587650299072266, 'learning_rate': 9.928489124000096e-07} [Rank 2] Trainer log: {'loss': 0.8906, 'grad_norm': 4.587650299072266, 'learning_rate': 9.928489124000096e-07} {'loss': 0.8906, 'grad_norm': 4.587650299072266, 'learning_rate': 9.928489124000096e-07, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.8755, 'grad_norm': 10.558819770812988, 'learning_rate': 9.898886883058878e-07}[Rank 0] Trainer log: {'loss': 0.8755, 'grad_norm': 10.558819770812988, 'learning_rate': 9.898886883058878e-07} [Rank 2] Trainer log: {'loss': 0.8755, 'grad_norm': 10.558819770812988, 'learning_rate': 9.898886883058878e-07} [Rank 1] Trainer log: {'loss': 0.8755, 'grad_norm': 10.558819770812988, 'learning_rate': 9.898886883058878e-07} {'loss': 0.8755, 'grad_norm': 10.558819770812988, 'learning_rate': 9.898886883058878e-07, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 0.7955, 'grad_norm': 13.2160005569458, 'learning_rate': 9.869326540133873e-07}[Rank 2] Trainer log: {'loss': 0.7955, 'grad_norm': 13.2160005569458, 'learning_rate': 9.869326540133873e-07}[Rank 3] Trainer log: {'loss': 0.7955, 'grad_norm': 13.2160005569458, 'learning_rate': 9.869326540133873e-07} [Rank 0] Trainer log: {'loss': 0.7955, 'grad_norm': 13.2160005569458, 'learning_rate': 9.869326540133873e-07} {'loss': 0.7955, 'grad_norm': 13.2160005569458, 'learning_rate': 9.869326540133873e-07, 'epoch': 0.86} [Rank 1] Trainer log: {'loss': 1.0633, 'grad_norm': 7.674502849578857, 'learning_rate': 9.839808108970927e-07} [Rank 3] Trainer log: {'loss': 1.0633, 'grad_norm': 7.674502849578857, 'learning_rate': 9.839808108970927e-07}[Rank 2] Trainer log: {'loss': 1.0633, 'grad_norm': 7.674502849578857, 'learning_rate': 9.839808108970927e-07} [Rank 0] Trainer log: {'loss': 1.0633, 'grad_norm': 7.674502849578857, 'learning_rate': 9.839808108970927e-07} {'loss': 1.0633, 'grad_norm': 7.674502849578857, 'learning_rate': 9.839808108970927e-07, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.921, 'grad_norm': 5.265581130981445, 'learning_rate': 9.810331603296498e-07}[Rank 1] Trainer log: {'loss': 0.921, 'grad_norm': 5.265581130981445, 'learning_rate': 9.810331603296498e-07}[Rank 0] Trainer log: {'loss': 0.921, 'grad_norm': 5.265581130981445, 'learning_rate': 9.810331603296498e-07} [Rank 2] Trainer log: {'loss': 0.921, 'grad_norm': 5.265581130981445, 'learning_rate': 9.810331603296498e-07} {'loss': 0.921, 'grad_norm': 5.265581130981445, 'learning_rate': 9.810331603296498e-07, 'epoch': 0.86} [Rank 3] Trainer log: {'loss': 0.9478, 'grad_norm': 4.5104498863220215, 'learning_rate': 9.780897036817438e-07}[Rank 1] Trainer log: {'loss': 0.9478, 'grad_norm': 4.5104498863220215, 'learning_rate': 9.780897036817438e-07} [Rank 0] Trainer log: {'loss': 0.9478, 'grad_norm': 4.5104498863220215, 'learning_rate': 9.780897036817438e-07} [Rank 2] Trainer log: {'loss': 0.9478, 'grad_norm': 4.5104498863220215, 'learning_rate': 9.780897036817438e-07} {'loss': 0.9478, 'grad_norm': 4.5104498863220215, 'learning_rate': 9.780897036817438e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.9861, 'grad_norm': 1.9686331748962402, 'learning_rate': 9.751504423221158e-07}[Rank 3] Trainer log: {'loss': 0.9861, 'grad_norm': 1.9686331748962402, 'learning_rate': 9.751504423221158e-07}[Rank 2] Trainer log: {'loss': 0.9861, 'grad_norm': 1.9686331748962402, 'learning_rate': 9.751504423221158e-07} [Rank 1] Trainer log: {'loss': 0.9861, 'grad_norm': 1.9686331748962402, 'learning_rate': 9.751504423221158e-07} {'loss': 0.9861, 'grad_norm': 1.9686331748962402, 'learning_rate': 9.751504423221158e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.8307, 'grad_norm': 3.493302345275879, 'learning_rate': 9.722153776175592e-07}[Rank 3] Trainer log: {'loss': 0.8307, 'grad_norm': 3.493302345275879, 'learning_rate': 9.722153776175592e-07} [Rank 1] Trainer log: {'loss': 0.8307, 'grad_norm': 3.493302345275879, 'learning_rate': 9.722153776175592e-07} [Rank 2] Trainer log: {'loss': 0.8307, 'grad_norm': 3.493302345275879, 'learning_rate': 9.722153776175592e-07} {'loss': 0.8307, 'grad_norm': 3.493302345275879, 'learning_rate': 9.722153776175592e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8862, 'grad_norm': 7.099724292755127, 'learning_rate': 9.692845109329074e-07}[Rank 1] Trainer log: {'loss': 0.8862, 'grad_norm': 7.099724292755127, 'learning_rate': 9.692845109329074e-07} [Rank 0] Trainer log: {'loss': 0.8862, 'grad_norm': 7.099724292755127, 'learning_rate': 9.692845109329074e-07} [Rank 2] Trainer log: {'loss': 0.8862, 'grad_norm': 7.099724292755127, 'learning_rate': 9.692845109329074e-07} {'loss': 0.8862, 'grad_norm': 7.099724292755127, 'learning_rate': 9.692845109329074e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8587, 'grad_norm': 3.532132148742676, 'learning_rate': 9.66357843631045e-07}[Rank 1] Trainer log: {'loss': 0.8587, 'grad_norm': 3.532132148742676, 'learning_rate': 9.66357843631045e-07}[Rank 2] Trainer log: {'loss': 0.8587, 'grad_norm': 3.532132148742676, 'learning_rate': 9.66357843631045e-07} [Rank 0] Trainer log: {'loss': 0.8587, 'grad_norm': 3.532132148742676, 'learning_rate': 9.66357843631045e-07} {'loss': 0.8587, 'grad_norm': 3.532132148742676, 'learning_rate': 9.66357843631045e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.7219, 'grad_norm': 8.560393333435059, 'learning_rate': 9.634353770729076e-07}[Rank 3] Trainer log: {'loss': 0.7219, 'grad_norm': 8.560393333435059, 'learning_rate': 9.634353770729076e-07}[Rank 2] Trainer log: {'loss': 0.7219, 'grad_norm': 8.560393333435059, 'learning_rate': 9.634353770729076e-07} [Rank 0] Trainer log: {'loss': 0.7219, 'grad_norm': 8.560393333435059, 'learning_rate': 9.634353770729076e-07} {'loss': 0.7219, 'grad_norm': 8.560393333435059, 'learning_rate': 9.634353770729076e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.7879, 'grad_norm': 2.1142852306365967, 'learning_rate': 9.605171126174751e-07}[Rank 1] Trainer log: {'loss': 0.7879, 'grad_norm': 2.1142852306365967, 'learning_rate': 9.605171126174751e-07} [Rank 0] Trainer log: {'loss': 0.7879, 'grad_norm': 2.1142852306365967, 'learning_rate': 9.605171126174751e-07}[Rank 2] Trainer log: {'loss': 0.7879, 'grad_norm': 2.1142852306365967, 'learning_rate': 9.605171126174751e-07} {'loss': 0.7879, 'grad_norm': 2.1142852306365967, 'learning_rate': 9.605171126174751e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.9149, 'grad_norm': 3.639902114868164, 'learning_rate': 9.576030516217672e-07}[Rank 1] Trainer log: {'loss': 0.9149, 'grad_norm': 3.639902114868164, 'learning_rate': 9.576030516217672e-07} [Rank 2] Trainer log: {'loss': 0.9149, 'grad_norm': 3.639902114868164, 'learning_rate': 9.576030516217672e-07} [Rank 0] Trainer log: {'loss': 0.9149, 'grad_norm': 3.639902114868164, 'learning_rate': 9.576030516217672e-07} {'loss': 0.9149, 'grad_norm': 3.639902114868164, 'learning_rate': 9.576030516217672e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 1.0273, 'grad_norm': 2.738520383834839, 'learning_rate': 9.546931954408622e-07}[Rank 1] Trainer log: {'loss': 1.0273, 'grad_norm': 2.738520383834839, 'learning_rate': 9.546931954408622e-07} [Rank 3] Trainer log: {'loss': 1.0273, 'grad_norm': 2.738520383834839, 'learning_rate': 9.546931954408622e-07}[Rank 2] Trainer log: {'loss': 1.0273, 'grad_norm': 2.738520383834839, 'learning_rate': 9.546931954408622e-07} {'loss': 1.0273, 'grad_norm': 2.738520383834839, 'learning_rate': 9.546931954408622e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.6798, 'grad_norm': 7.957915306091309, 'learning_rate': 9.517875454278691e-07}[Rank 3] Trainer log: {'loss': 0.6798, 'grad_norm': 7.957915306091309, 'learning_rate': 9.517875454278691e-07}[Rank 1] Trainer log: {'loss': 0.6798, 'grad_norm': 7.957915306091309, 'learning_rate': 9.517875454278691e-07} [Rank 2] Trainer log: {'loss': 0.6798, 'grad_norm': 7.957915306091309, 'learning_rate': 9.517875454278691e-07} {'loss': 0.6798, 'grad_norm': 7.957915306091309, 'learning_rate': 9.517875454278691e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8646, 'grad_norm': 4.401297092437744, 'learning_rate': 9.488861029339502e-07}[Rank 1] Trainer log: {'loss': 0.8646, 'grad_norm': 4.401297092437744, 'learning_rate': 9.488861029339502e-07} [Rank 0] Trainer log: {'loss': 0.8646, 'grad_norm': 4.401297092437744, 'learning_rate': 9.488861029339502e-07} [Rank 2] Trainer log: {'loss': 0.8646, 'grad_norm': 4.401297092437744, 'learning_rate': 9.488861029339502e-07} {'loss': 0.8646, 'grad_norm': 4.401297092437744, 'learning_rate': 9.488861029339502e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.7323, 'grad_norm': 3.183652400970459, 'learning_rate': 9.459888693083097e-07} [Rank 0] Trainer log: {'loss': 0.7323, 'grad_norm': 3.183652400970459, 'learning_rate': 9.459888693083097e-07} [Rank 1] Trainer log: {'loss': 0.7323, 'grad_norm': 3.183652400970459, 'learning_rate': 9.459888693083097e-07}[Rank 2] Trainer log: {'loss': 0.7323, 'grad_norm': 3.183652400970459, 'learning_rate': 9.459888693083097e-07} {'loss': 0.7323, 'grad_norm': 3.183652400970459, 'learning_rate': 9.459888693083097e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.6512, 'grad_norm': 8.01297664642334, 'learning_rate': 9.43095845898192e-07}[Rank 1] Trainer log: {'loss': 0.6512, 'grad_norm': 8.01297664642334, 'learning_rate': 9.43095845898192e-07}[Rank 2] Trainer log: {'loss': 0.6512, 'grad_norm': 8.01297664642334, 'learning_rate': 9.43095845898192e-07} [Rank 0] Trainer log: {'loss': 0.6512, 'grad_norm': 8.01297664642334, 'learning_rate': 9.43095845898192e-07} {'loss': 0.6512, 'grad_norm': 8.01297664642334, 'learning_rate': 9.43095845898192e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8196, 'grad_norm': 8.231982231140137, 'learning_rate': 9.402070340488845e-07} [Rank 0] Trainer log: {'loss': 0.8196, 'grad_norm': 8.231982231140137, 'learning_rate': 9.402070340488845e-07}[Rank 2] Trainer log: {'loss': 0.8196, 'grad_norm': 8.231982231140137, 'learning_rate': 9.402070340488845e-07} [Rank 1] Trainer log: {'loss': 0.8196, 'grad_norm': 8.231982231140137, 'learning_rate': 9.402070340488845e-07} {'loss': 0.8196, 'grad_norm': 8.231982231140137, 'learning_rate': 9.402070340488845e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.9681, 'grad_norm': 7.954078674316406, 'learning_rate': 9.373224351037158e-07} [Rank 0] Trainer log: {'loss': 0.9681, 'grad_norm': 7.954078674316406, 'learning_rate': 9.373224351037158e-07}[Rank 1] Trainer log: {'loss': 0.9681, 'grad_norm': 7.954078674316406, 'learning_rate': 9.373224351037158e-07} [Rank 2] Trainer log: {'loss': 0.9681, 'grad_norm': 7.954078674316406, 'learning_rate': 9.373224351037158e-07} {'loss': 0.9681, 'grad_norm': 7.954078674316406, 'learning_rate': 9.373224351037158e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.7479, 'grad_norm': 4.070235729217529, 'learning_rate': 9.344420504040597e-07} [Rank 2] Trainer log: {'loss': 0.7479, 'grad_norm': 4.070235729217529, 'learning_rate': 9.344420504040597e-07} [Rank 1] Trainer log: {'loss': 0.7479, 'grad_norm': 4.070235729217529, 'learning_rate': 9.344420504040597e-07} [Rank 0] Trainer log: {'loss': 0.7479, 'grad_norm': 4.070235729217529, 'learning_rate': 9.344420504040597e-07} {'loss': 0.7479, 'grad_norm': 4.070235729217529, 'learning_rate': 9.344420504040597e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8814, 'grad_norm': 8.840169906616211, 'learning_rate': 9.315658812893258e-07}[Rank 0] Trainer log: {'loss': 0.8814, 'grad_norm': 8.840169906616211, 'learning_rate': 9.315658812893258e-07} [Rank 2] Trainer log: {'loss': 0.8814, 'grad_norm': 8.840169906616211, 'learning_rate': 9.315658812893258e-07} [Rank 1] Trainer log: {'loss': 0.8814, 'grad_norm': 8.840169906616211, 'learning_rate': 9.315658812893258e-07} {'loss': 0.8814, 'grad_norm': 8.840169906616211, 'learning_rate': 9.315658812893258e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8868, 'grad_norm': 3.2721757888793945, 'learning_rate': 9.28693929096961e-07}[Rank 3] Trainer log: {'loss': 0.8868, 'grad_norm': 3.2721757888793945, 'learning_rate': 9.28693929096961e-07}[Rank 1] Trainer log: {'loss': 0.8868, 'grad_norm': 3.2721757888793945, 'learning_rate': 9.28693929096961e-07} [Rank 0] Trainer log: {'loss': 0.8868, 'grad_norm': 3.2721757888793945, 'learning_rate': 9.28693929096961e-07} {'loss': 0.8868, 'grad_norm': 3.2721757888793945, 'learning_rate': 9.28693929096961e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8137, 'grad_norm': 1.8597596883773804, 'learning_rate': 9.258261951624581e-07}[Rank 0] Trainer log: {'loss': 0.8137, 'grad_norm': 1.8597596883773804, 'learning_rate': 9.258261951624581e-07}[Rank 1] Trainer log: {'loss': 0.8137, 'grad_norm': 1.8597596883773804, 'learning_rate': 9.258261951624581e-07} [Rank 2] Trainer log: {'loss': 0.8137, 'grad_norm': 1.8597596883773804, 'learning_rate': 9.258261951624581e-07} {'loss': 0.8137, 'grad_norm': 1.8597596883773804, 'learning_rate': 9.258261951624581e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.9277, 'grad_norm': 5.762110233306885, 'learning_rate': 9.229626808193459e-07}[Rank 0] Trainer log: {'loss': 0.9277, 'grad_norm': 5.762110233306885, 'learning_rate': 9.229626808193459e-07}[Rank 2] Trainer log: {'loss': 0.9277, 'grad_norm': 5.762110233306885, 'learning_rate': 9.229626808193459e-07} [Rank 1] Trainer log: {'loss': 0.9277, 'grad_norm': 5.762110233306885, 'learning_rate': 9.229626808193459e-07} {'loss': 0.9277, 'grad_norm': 5.762110233306885, 'learning_rate': 9.229626808193459e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.7457, 'grad_norm': 11.363520622253418, 'learning_rate': 9.201033873991883e-07}[Rank 1] Trainer log: {'loss': 0.7457, 'grad_norm': 11.363520622253418, 'learning_rate': 9.201033873991883e-07}[Rank 3] Trainer log: {'loss': 0.7457, 'grad_norm': 11.363520622253418, 'learning_rate': 9.201033873991883e-07} [Rank 0] Trainer log: {'loss': 0.7457, 'grad_norm': 11.363520622253418, 'learning_rate': 9.201033873991883e-07} {'loss': 0.7457, 'grad_norm': 11.363520622253418, 'learning_rate': 9.201033873991883e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8845, 'grad_norm': 2.837437629699707, 'learning_rate': 9.172483162315915e-07}[Rank 1] Trainer log: {'loss': 0.8845, 'grad_norm': 2.837437629699707, 'learning_rate': 9.172483162315915e-07}[Rank 2] Trainer log: {'loss': 0.8845, 'grad_norm': 2.837437629699707, 'learning_rate': 9.172483162315915e-07} [Rank 0] Trainer log: {'loss': 0.8845, 'grad_norm': 2.837437629699707, 'learning_rate': 9.172483162315915e-07} {'loss': 0.8845, 'grad_norm': 2.837437629699707, 'learning_rate': 9.172483162315915e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.7676, 'grad_norm': 3.0638561248779297, 'learning_rate': 9.143974686441925e-07} [Rank 1] Trainer log: {'loss': 0.7676, 'grad_norm': 3.0638561248779297, 'learning_rate': 9.143974686441925e-07} [Rank 2] Trainer log: {'loss': 0.7676, 'grad_norm': 3.0638561248779297, 'learning_rate': 9.143974686441925e-07} [Rank 0] Trainer log: {'loss': 0.7676, 'grad_norm': 3.0638561248779297, 'learning_rate': 9.143974686441925e-07} {'loss': 0.7676, 'grad_norm': 3.0638561248779297, 'learning_rate': 9.143974686441925e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.7342, 'grad_norm': 3.7212886810302734, 'learning_rate': 9.115508459626654e-07} [Rank 2] Trainer log: {'loss': 0.7342, 'grad_norm': 3.7212886810302734, 'learning_rate': 9.115508459626654e-07} [Rank 1] Trainer log: {'loss': 0.7342, 'grad_norm': 3.7212886810302734, 'learning_rate': 9.115508459626654e-07}[Rank 0] Trainer log: {'loss': 0.7342, 'grad_norm': 3.7212886810302734, 'learning_rate': 9.115508459626654e-07} {'loss': 0.7342, 'grad_norm': 3.7212886810302734, 'learning_rate': 9.115508459626654e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8225, 'grad_norm': 5.916518211364746, 'learning_rate': 9.087084495107256e-07}[Rank 0] Trainer log: {'loss': 0.8225, 'grad_norm': 5.916518211364746, 'learning_rate': 9.087084495107256e-07} [Rank 1] Trainer log: {'loss': 0.8225, 'grad_norm': 5.916518211364746, 'learning_rate': 9.087084495107256e-07} [Rank 2] Trainer log: {'loss': 0.8225, 'grad_norm': 5.916518211364746, 'learning_rate': 9.087084495107256e-07} {'loss': 0.8225, 'grad_norm': 5.916518211364746, 'learning_rate': 9.087084495107256e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.9142, 'grad_norm': 6.626760959625244, 'learning_rate': 9.058702806101172e-07} [Rank 2] Trainer log: {'loss': 0.9142, 'grad_norm': 6.626760959625244, 'learning_rate': 9.058702806101172e-07}[Rank 0] Trainer log: {'loss': 0.9142, 'grad_norm': 6.626760959625244, 'learning_rate': 9.058702806101172e-07} [Rank 1] Trainer log: {'loss': 0.9142, 'grad_norm': 6.626760959625244, 'learning_rate': 9.058702806101172e-07} {'loss': 0.9142, 'grad_norm': 6.626760959625244, 'learning_rate': 9.058702806101172e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.7442, 'grad_norm': 8.525958061218262, 'learning_rate': 9.030363405806176e-07}[Rank 3] Trainer log: {'loss': 0.7442, 'grad_norm': 8.525958061218262, 'learning_rate': 9.030363405806176e-07}[Rank 1] Trainer log: {'loss': 0.7442, 'grad_norm': 8.525958061218262, 'learning_rate': 9.030363405806176e-07} [Rank 0] Trainer log: {'loss': 0.7442, 'grad_norm': 8.525958061218262, 'learning_rate': 9.030363405806176e-07} {'loss': 0.7442, 'grad_norm': 8.525958061218262, 'learning_rate': 9.030363405806176e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.7892, 'grad_norm': 3.8097012042999268, 'learning_rate': 9.002066307400437e-07}[Rank 0] Trainer log: {'loss': 0.7892, 'grad_norm': 3.8097012042999268, 'learning_rate': 9.002066307400437e-07} [Rank 3] Trainer log: {'loss': 0.7892, 'grad_norm': 3.8097012042999268, 'learning_rate': 9.002066307400437e-07} [Rank 2] Trainer log: {'loss': 0.7892, 'grad_norm': 3.8097012042999268, 'learning_rate': 9.002066307400437e-07} {'loss': 0.7892, 'grad_norm': 3.8097012042999268, 'learning_rate': 9.002066307400437e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8141, 'grad_norm': 8.946639060974121, 'learning_rate': 8.973811524042419e-07}[Rank 2] Trainer log: {'loss': 0.8141, 'grad_norm': 8.946639060974121, 'learning_rate': 8.973811524042419e-07} [Rank 0] Trainer log: {'loss': 0.8141, 'grad_norm': 8.946639060974121, 'learning_rate': 8.973811524042419e-07}[Rank 1] Trainer log: {'loss': 0.8141, 'grad_norm': 8.946639060974121, 'learning_rate': 8.973811524042419e-07} {'loss': 0.8141, 'grad_norm': 8.946639060974121, 'learning_rate': 8.973811524042419e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.7717, 'grad_norm': 1.9454610347747803, 'learning_rate': 8.945599068870881e-07}[Rank 2] Trainer log: {'loss': 0.7717, 'grad_norm': 1.9454610347747803, 'learning_rate': 8.945599068870881e-07}[Rank 0] Trainer log: {'loss': 0.7717, 'grad_norm': 1.9454610347747803, 'learning_rate': 8.945599068870881e-07} [Rank 3] Trainer log: {'loss': 0.7717, 'grad_norm': 1.9454610347747803, 'learning_rate': 8.945599068870881e-07} {'loss': 0.7717, 'grad_norm': 1.9454610347747803, 'learning_rate': 8.945599068870881e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8336, 'grad_norm': 2.984185218811035, 'learning_rate': 8.917428955004958e-07}[Rank 3] Trainer log: {'loss': 0.8336, 'grad_norm': 2.984185218811035, 'learning_rate': 8.917428955004958e-07}[Rank 1] Trainer log: {'loss': 0.8336, 'grad_norm': 2.984185218811035, 'learning_rate': 8.917428955004958e-07} [Rank 0] Trainer log: {'loss': 0.8336, 'grad_norm': 2.984185218811035, 'learning_rate': 8.917428955004958e-07} {'loss': 0.8336, 'grad_norm': 2.984185218811035, 'learning_rate': 8.917428955004958e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.8673, 'grad_norm': 3.0681819915771484, 'learning_rate': 8.889301195544054e-07}[Rank 1] Trainer log: {'loss': 0.8673, 'grad_norm': 3.0681819915771484, 'learning_rate': 8.889301195544054e-07} [Rank 2] Trainer log: {'loss': 0.8673, 'grad_norm': 3.0681819915771484, 'learning_rate': 8.889301195544054e-07} [Rank 3] Trainer log: {'loss': 0.8673, 'grad_norm': 3.0681819915771484, 'learning_rate': 8.889301195544054e-07} {'loss': 0.8673, 'grad_norm': 3.0681819915771484, 'learning_rate': 8.889301195544054e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8838, 'grad_norm': 3.1226351261138916, 'learning_rate': 8.861215803567869e-07}[Rank 0] Trainer log: {'loss': 0.8838, 'grad_norm': 3.1226351261138916, 'learning_rate': 8.861215803567869e-07}[Rank 3] Trainer log: {'loss': 0.8838, 'grad_norm': 3.1226351261138916, 'learning_rate': 8.861215803567869e-07} {'loss': 0.8838, 'grad_norm': 3.1226351261138916, 'learning_rate': 8.861215803567869e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.8838, 'grad_norm': 3.1226351261138916, 'learning_rate': 8.861215803567869e-07} [Rank 3] Trainer log: {'loss': 0.8286, 'grad_norm': 5.856039524078369, 'learning_rate': 8.833172792136469e-07}[Rank 2] Trainer log: {'loss': 0.8286, 'grad_norm': 5.856039524078369, 'learning_rate': 8.833172792136469e-07}[Rank 0] Trainer log: {'loss': 0.8286, 'grad_norm': 5.856039524078369, 'learning_rate': 8.833172792136469e-07} [Rank 1] Trainer log: {'loss': 0.8286, 'grad_norm': 5.856039524078369, 'learning_rate': 8.833172792136469e-07} {'loss': 0.8286, 'grad_norm': 5.856039524078369, 'learning_rate': 8.833172792136469e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.6924, 'grad_norm': 4.734797954559326, 'learning_rate': 8.805172174290122e-07}[Rank 3] Trainer log: {'loss': 0.6924, 'grad_norm': 4.734797954559326, 'learning_rate': 8.805172174290122e-07} [Rank 0] Trainer log: {'loss': 0.6924, 'grad_norm': 4.734797954559326, 'learning_rate': 8.805172174290122e-07} [Rank 2] Trainer log: {'loss': 0.6924, 'grad_norm': 4.734797954559326, 'learning_rate': 8.805172174290122e-07} {'loss': 0.6924, 'grad_norm': 4.734797954559326, 'learning_rate': 8.805172174290122e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.835, 'grad_norm': 5.879898548126221, 'learning_rate': 8.777213963049424e-07}[Rank 0] Trainer log: {'loss': 0.835, 'grad_norm': 5.879898548126221, 'learning_rate': 8.777213963049424e-07}[Rank 3] Trainer log: {'loss': 0.835, 'grad_norm': 5.879898548126221, 'learning_rate': 8.777213963049424e-07} [Rank 2] Trainer log: {'loss': 0.835, 'grad_norm': 5.879898548126221, 'learning_rate': 8.777213963049424e-07} {'loss': 0.835, 'grad_norm': 5.879898548126221, 'learning_rate': 8.777213963049424e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.8554, 'grad_norm': 4.184335231781006, 'learning_rate': 8.749298171415277e-07}[Rank 1] Trainer log: {'loss': 0.8554, 'grad_norm': 4.184335231781006, 'learning_rate': 8.749298171415277e-07} [Rank 3] Trainer log: {'loss': 0.8554, 'grad_norm': 4.184335231781006, 'learning_rate': 8.749298171415277e-07}[Rank 2] Trainer log: {'loss': 0.8554, 'grad_norm': 4.184335231781006, 'learning_rate': 8.749298171415277e-07} {'loss': 0.8554, 'grad_norm': 4.184335231781006, 'learning_rate': 8.749298171415277e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8652, 'grad_norm': 2.7682223320007324, 'learning_rate': 8.721424812368828e-07} [Rank 0] Trainer log: {'loss': 0.8652, 'grad_norm': 2.7682223320007324, 'learning_rate': 8.721424812368828e-07}[Rank 3] Trainer log: {'loss': 0.8652, 'grad_norm': 2.7682223320007324, 'learning_rate': 8.721424812368828e-07} [Rank 1] Trainer log: {'loss': 0.8652, 'grad_norm': 2.7682223320007324, 'learning_rate': 8.721424812368828e-07} {'loss': 0.8652, 'grad_norm': 2.7682223320007324, 'learning_rate': 8.721424812368828e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.6915, 'grad_norm': 3.6142473220825195, 'learning_rate': 8.693593898871488e-07}[Rank 1] Trainer log: {'loss': 0.6915, 'grad_norm': 3.6142473220825195, 'learning_rate': 8.693593898871488e-07} [Rank 2] Trainer log: {'loss': 0.6915, 'grad_norm': 3.6142473220825195, 'learning_rate': 8.693593898871488e-07} [Rank 0] Trainer log: {'loss': 0.6915, 'grad_norm': 3.6142473220825195, 'learning_rate': 8.693593898871488e-07} {'loss': 0.6915, 'grad_norm': 3.6142473220825195, 'learning_rate': 8.693593898871488e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8256, 'grad_norm': 4.279139518737793, 'learning_rate': 8.665805443864961e-07}[Rank 0] Trainer log: {'loss': 0.8256, 'grad_norm': 4.279139518737793, 'learning_rate': 8.665805443864961e-07} [Rank 1] Trainer log: {'loss': 0.8256, 'grad_norm': 4.279139518737793, 'learning_rate': 8.665805443864961e-07} [Rank 3] Trainer log: {'loss': 0.8256, 'grad_norm': 4.279139518737793, 'learning_rate': 8.665805443864961e-07} {'loss': 0.8256, 'grad_norm': 4.279139518737793, 'learning_rate': 8.665805443864961e-07, 'epoch': 0.87} [Rank 1] Trainer log: {'loss': 0.9244, 'grad_norm': 3.429701566696167, 'learning_rate': 8.638059460271175e-07}[Rank 3] Trainer log: {'loss': 0.9244, 'grad_norm': 3.429701566696167, 'learning_rate': 8.638059460271175e-07}[Rank 0] Trainer log: {'loss': 0.9244, 'grad_norm': 3.429701566696167, 'learning_rate': 8.638059460271175e-07} {'loss': 0.9244, 'grad_norm': 3.429701566696167, 'learning_rate': 8.638059460271175e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.9244, 'grad_norm': 3.429701566696167, 'learning_rate': 8.638059460271175e-07} [Rank 0] Trainer log: {'loss': 0.7905, 'grad_norm': 7.728419303894043, 'learning_rate': 8.610355960992323e-07}[Rank 3] Trainer log: {'loss': 0.7905, 'grad_norm': 7.728419303894043, 'learning_rate': 8.610355960992323e-07} [Rank 2] Trainer log: {'loss': 0.7905, 'grad_norm': 7.728419303894043, 'learning_rate': 8.610355960992323e-07} [Rank 1] Trainer log: {'loss': 0.7905, 'grad_norm': 7.728419303894043, 'learning_rate': 8.610355960992323e-07} {'loss': 0.7905, 'grad_norm': 7.728419303894043, 'learning_rate': 8.610355960992323e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.7412, 'grad_norm': 7.467408180236816, 'learning_rate': 8.582694958910809e-07} [Rank 1] Trainer log: {'loss': 0.7412, 'grad_norm': 7.467408180236816, 'learning_rate': 8.582694958910809e-07} [Rank 0] Trainer log: {'loss': 0.7412, 'grad_norm': 7.467408180236816, 'learning_rate': 8.582694958910809e-07} [Rank 3] Trainer log: {'loss': 0.7412, 'grad_norm': 7.467408180236816, 'learning_rate': 8.582694958910809e-07} {'loss': 0.7412, 'grad_norm': 7.467408180236816, 'learning_rate': 8.582694958910809e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 1.0262, 'grad_norm': 2.6239817142486572, 'learning_rate': 8.555076466889345e-07}[Rank 2] Trainer log: {'loss': 1.0262, 'grad_norm': 2.6239817142486572, 'learning_rate': 8.555076466889345e-07}[Rank 3] Trainer log: {'loss': 1.0262, 'grad_norm': 2.6239817142486572, 'learning_rate': 8.555076466889345e-07} [Rank 1] Trainer log: {'loss': 1.0262, 'grad_norm': 2.6239817142486572, 'learning_rate': 8.555076466889345e-07} {'loss': 1.0262, 'grad_norm': 2.6239817142486572, 'learning_rate': 8.555076466889345e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 1.049, 'grad_norm': 2.7903659343719482, 'learning_rate': 8.527500497770813e-07}[Rank 1] Trainer log: {'loss': 1.049, 'grad_norm': 2.7903659343719482, 'learning_rate': 8.527500497770813e-07} [Rank 0] Trainer log: {'loss': 1.049, 'grad_norm': 2.7903659343719482, 'learning_rate': 8.527500497770813e-07}[Rank 2] Trainer log: {'loss': 1.049, 'grad_norm': 2.7903659343719482, 'learning_rate': 8.527500497770813e-07} {'loss': 1.049, 'grad_norm': 2.7903659343719482, 'learning_rate': 8.527500497770813e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.8738, 'grad_norm': 8.863433837890625, 'learning_rate': 8.499967064378345e-07} [Rank 0] Trainer log: {'loss': 0.8738, 'grad_norm': 8.863433837890625, 'learning_rate': 8.499967064378345e-07}[Rank 2] Trainer log: {'loss': 0.8738, 'grad_norm': 8.863433837890625, 'learning_rate': 8.499967064378345e-07} [Rank 1] Trainer log: {'loss': 0.8738, 'grad_norm': 8.863433837890625, 'learning_rate': 8.499967064378345e-07} {'loss': 0.8738, 'grad_norm': 8.863433837890625, 'learning_rate': 8.499967064378345e-07, 'epoch': 0.87} [Rank 2] Trainer log: {'loss': 0.8186, 'grad_norm': 3.987638473510742, 'learning_rate': 8.472476179515321e-07}[Rank 1] Trainer log: {'loss': 0.8186, 'grad_norm': 3.987638473510742, 'learning_rate': 8.472476179515321e-07}[Rank 3] Trainer log: {'loss': 0.8186, 'grad_norm': 3.987638473510742, 'learning_rate': 8.472476179515321e-07} [Rank 0] Trainer log: {'loss': 0.8186, 'grad_norm': 3.987638473510742, 'learning_rate': 8.472476179515321e-07} {'loss': 0.8186, 'grad_norm': 3.987638473510742, 'learning_rate': 8.472476179515321e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.9156, 'grad_norm': 3.3163511753082275, 'learning_rate': 8.445027855965282e-07}[Rank 1] Trainer log: {'loss': 0.9156, 'grad_norm': 3.3163511753082275, 'learning_rate': 8.445027855965282e-07} [Rank 3] Trainer log: {'loss': 0.9156, 'grad_norm': 3.3163511753082275, 'learning_rate': 8.445027855965282e-07}[Rank 2] Trainer log: {'loss': 0.9156, 'grad_norm': 3.3163511753082275, 'learning_rate': 8.445027855965282e-07} {'loss': 0.9156, 'grad_norm': 3.3163511753082275, 'learning_rate': 8.445027855965282e-07, 'epoch': 0.87} [Rank 0] Trainer log: {'loss': 0.8705, 'grad_norm': 7.077831268310547, 'learning_rate': 8.417622106491996e-07}[Rank 3] Trainer log: {'loss': 0.8705, 'grad_norm': 7.077831268310547, 'learning_rate': 8.417622106491996e-07} [Rank 1] Trainer log: {'loss': 0.8705, 'grad_norm': 7.077831268310547, 'learning_rate': 8.417622106491996e-07} [Rank 2] Trainer log: {'loss': 0.8705, 'grad_norm': 7.077831268310547, 'learning_rate': 8.417622106491996e-07} {'loss': 0.8705, 'grad_norm': 7.077831268310547, 'learning_rate': 8.417622106491996e-07, 'epoch': 0.87} [Rank 3] Trainer log: {'loss': 0.9666, 'grad_norm': 2.873533248901367, 'learning_rate': 8.390258943839458e-07}[Rank 1] Trainer log: {'loss': 0.9666, 'grad_norm': 2.873533248901367, 'learning_rate': 8.390258943839458e-07} [Rank 0] Trainer log: {'loss': 0.9666, 'grad_norm': 2.873533248901367, 'learning_rate': 8.390258943839458e-07} [Rank 2] Trainer log: {'loss': 0.9666, 'grad_norm': 2.873533248901367, 'learning_rate': 8.390258943839458e-07} {'loss': 0.9666, 'grad_norm': 2.873533248901367, 'learning_rate': 8.390258943839458e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.9242, 'grad_norm': 3.6381335258483887, 'learning_rate': 8.362938380731844e-07}[Rank 3] Trainer log: {'loss': 0.9242, 'grad_norm': 3.6381335258483887, 'learning_rate': 8.362938380731844e-07} [Rank 2] Trainer log: {'loss': 0.9242, 'grad_norm': 3.6381335258483887, 'learning_rate': 8.362938380731844e-07} [Rank 1] Trainer log: {'loss': 0.9242, 'grad_norm': 3.6381335258483887, 'learning_rate': 8.362938380731844e-07} {'loss': 0.9242, 'grad_norm': 3.6381335258483887, 'learning_rate': 8.362938380731844e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.8408, 'grad_norm': 11.246790885925293, 'learning_rate': 8.335660429873494e-07}[Rank 1] Trainer log: {'loss': 0.8408, 'grad_norm': 11.246790885925293, 'learning_rate': 8.335660429873494e-07} [Rank 3] Trainer log: {'loss': 0.8408, 'grad_norm': 11.246790885925293, 'learning_rate': 8.335660429873494e-07} [Rank 2] Trainer log: {'loss': 0.8408, 'grad_norm': 11.246790885925293, 'learning_rate': 8.335660429873494e-07} {'loss': 0.8408, 'grad_norm': 11.246790885925293, 'learning_rate': 8.335660429873494e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 1.0635, 'grad_norm': 2.799032211303711, 'learning_rate': 8.308425103948991e-07} [Rank 0] Trainer log: {'loss': 1.0635, 'grad_norm': 2.799032211303711, 'learning_rate': 8.308425103948991e-07}[Rank 1] Trainer log: {'loss': 1.0635, 'grad_norm': 2.799032211303711, 'learning_rate': 8.308425103948991e-07} [Rank 2] Trainer log: {'loss': 1.0635, 'grad_norm': 2.799032211303711, 'learning_rate': 8.308425103948991e-07} {'loss': 1.0635, 'grad_norm': 2.799032211303711, 'learning_rate': 8.308425103948991e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.4906, 'grad_norm': 15.93394660949707, 'learning_rate': 8.28123241562302e-07} [Rank 0] Trainer log: {'loss': 0.4906, 'grad_norm': 15.93394660949707, 'learning_rate': 8.28123241562302e-07}[Rank 3] Trainer log: {'loss': 0.4906, 'grad_norm': 15.93394660949707, 'learning_rate': 8.28123241562302e-07} [Rank 2] Trainer log: {'loss': 0.4906, 'grad_norm': 15.93394660949707, 'learning_rate': 8.28123241562302e-07} {'loss': 0.4906, 'grad_norm': 15.93394660949707, 'learning_rate': 8.28123241562302e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.9243, 'grad_norm': 3.2028908729553223, 'learning_rate': 8.254082377540506e-07} [Rank 3] Trainer log: {'loss': 0.9243, 'grad_norm': 3.2028908729553223, 'learning_rate': 8.254082377540506e-07}[Rank 0] Trainer log: {'loss': 0.9243, 'grad_norm': 3.2028908729553223, 'learning_rate': 8.254082377540506e-07} [Rank 1] Trainer log: {'loss': 0.9243, 'grad_norm': 3.2028908729553223, 'learning_rate': 8.254082377540506e-07} {'loss': 0.9243, 'grad_norm': 3.2028908729553223, 'learning_rate': 8.254082377540506e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.6969, 'grad_norm': 7.63700532913208, 'learning_rate': 8.226975002326554e-07}[Rank 1] Trainer log: {'loss': 0.6969, 'grad_norm': 7.63700532913208, 'learning_rate': 8.226975002326554e-07}[Rank 3] Trainer log: {'loss': 0.6969, 'grad_norm': 7.63700532913208, 'learning_rate': 8.226975002326554e-07} [Rank 0] Trainer log: {'loss': 0.6969, 'grad_norm': 7.63700532913208, 'learning_rate': 8.226975002326554e-07} {'loss': 0.6969, 'grad_norm': 7.63700532913208, 'learning_rate': 8.226975002326554e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.9462, 'grad_norm': 4.232856273651123, 'learning_rate': 8.199910302586345e-07}[Rank 3] Trainer log: {'loss': 0.9462, 'grad_norm': 4.232856273651123, 'learning_rate': 8.199910302586345e-07}[Rank 1] Trainer log: {'loss': 0.9462, 'grad_norm': 4.232856273651123, 'learning_rate': 8.199910302586345e-07} [Rank 2] Trainer log: {'loss': 0.9462, 'grad_norm': 4.232856273651123, 'learning_rate': 8.199910302586345e-07} {'loss': 0.9462, 'grad_norm': 4.232856273651123, 'learning_rate': 8.199910302586345e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8432, 'grad_norm': 1.5943574905395508, 'learning_rate': 8.172888290905268e-07}[Rank 2] Trainer log: {'loss': 0.8432, 'grad_norm': 1.5943574905395508, 'learning_rate': 8.172888290905268e-07}[Rank 3] Trainer log: {'loss': 0.8432, 'grad_norm': 1.5943574905395508, 'learning_rate': 8.172888290905268e-07} [Rank 0] Trainer log: {'loss': 0.8432, 'grad_norm': 1.5943574905395508, 'learning_rate': 8.172888290905268e-07} {'loss': 0.8432, 'grad_norm': 1.5943574905395508, 'learning_rate': 8.172888290905268e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.8872, 'grad_norm': 5.081014633178711, 'learning_rate': 8.145908979848882e-07}[Rank 2] Trainer log: {'loss': 0.8872, 'grad_norm': 5.081014633178711, 'learning_rate': 8.145908979848882e-07}[Rank 1] Trainer log: {'loss': 0.8872, 'grad_norm': 5.081014633178711, 'learning_rate': 8.145908979848882e-07} [Rank 0] Trainer log: {'loss': 0.8872, 'grad_norm': 5.081014633178711, 'learning_rate': 8.145908979848882e-07} {'loss': 0.8872, 'grad_norm': 5.081014633178711, 'learning_rate': 8.145908979848882e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8176, 'grad_norm': 4.383447170257568, 'learning_rate': 8.118972381962853e-07}[Rank 2] Trainer log: {'loss': 0.8176, 'grad_norm': 4.383447170257568, 'learning_rate': 8.118972381962853e-07} [Rank 3] Trainer log: {'loss': 0.8176, 'grad_norm': 4.383447170257568, 'learning_rate': 8.118972381962853e-07} [Rank 0] Trainer log: {'loss': 0.8176, 'grad_norm': 4.383447170257568, 'learning_rate': 8.118972381962853e-07} {'loss': 0.8176, 'grad_norm': 4.383447170257568, 'learning_rate': 8.118972381962853e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.8076, 'grad_norm': 3.5977730751037598, 'learning_rate': 8.092078509772983e-07}[Rank 1] Trainer log: {'loss': 0.8076, 'grad_norm': 3.5977730751037598, 'learning_rate': 8.092078509772983e-07} [Rank 0] Trainer log: {'loss': 0.8076, 'grad_norm': 3.5977730751037598, 'learning_rate': 8.092078509772983e-07}[Rank 2] Trainer log: {'loss': 0.8076, 'grad_norm': 3.5977730751037598, 'learning_rate': 8.092078509772983e-07} {'loss': 0.8076, 'grad_norm': 3.5977730751037598, 'learning_rate': 8.092078509772983e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.9659, 'grad_norm': 2.694952964782715, 'learning_rate': 8.065227375785245e-07}[Rank 3] Trainer log: {'loss': 0.9659, 'grad_norm': 2.694952964782715, 'learning_rate': 8.065227375785245e-07} [Rank 2] Trainer log: {'loss': 0.9659, 'grad_norm': 2.694952964782715, 'learning_rate': 8.065227375785245e-07}[Rank 1] Trainer log: {'loss': 0.9659, 'grad_norm': 2.694952964782715, 'learning_rate': 8.065227375785245e-07} {'loss': 0.9659, 'grad_norm': 2.694952964782715, 'learning_rate': 8.065227375785245e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8979, 'grad_norm': 3.2324934005737305, 'learning_rate': 8.038418992485697e-07}[Rank 3] Trainer log: {'loss': 0.8979, 'grad_norm': 3.2324934005737305, 'learning_rate': 8.038418992485697e-07}[Rank 2] Trainer log: {'loss': 0.8979, 'grad_norm': 3.2324934005737305, 'learning_rate': 8.038418992485697e-07} [Rank 0] Trainer log: {'loss': 0.8979, 'grad_norm': 3.2324934005737305, 'learning_rate': 8.038418992485697e-07} {'loss': 0.8979, 'grad_norm': 3.2324934005737305, 'learning_rate': 8.038418992485697e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 1.0647, 'grad_norm': 3.106532335281372, 'learning_rate': 8.011653372340533e-07}[Rank 2] Trainer log: {'loss': 1.0647, 'grad_norm': 3.106532335281372, 'learning_rate': 8.011653372340533e-07} [Rank 1] Trainer log: {'loss': 1.0647, 'grad_norm': 3.106532335281372, 'learning_rate': 8.011653372340533e-07} {'loss': 1.0647, 'grad_norm': 3.106532335281372, 'learning_rate': 8.011653372340533e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 1.0647, 'grad_norm': 3.106532335281372, 'learning_rate': 8.011653372340533e-07} [Rank 3] Trainer log: {'loss': 0.553, 'grad_norm': 21.30074119567871, 'learning_rate': 7.984930527796087e-07} [Rank 2] Trainer log: {'loss': 0.553, 'grad_norm': 21.30074119567871, 'learning_rate': 7.984930527796087e-07} [Rank 0] Trainer log: {'loss': 0.553, 'grad_norm': 21.30074119567871, 'learning_rate': 7.984930527796087e-07} [Rank 1] Trainer log: {'loss': 0.553, 'grad_norm': 21.30074119567871, 'learning_rate': 7.984930527796087e-07} {'loss': 0.553, 'grad_norm': 21.30074119567871, 'learning_rate': 7.984930527796087e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.7825, 'grad_norm': 4.332063674926758, 'learning_rate': 7.958250471278772e-07}[Rank 0] Trainer log: {'loss': 0.7825, 'grad_norm': 4.332063674926758, 'learning_rate': 7.958250471278772e-07} [Rank 2] Trainer log: {'loss': 0.7825, 'grad_norm': 4.332063674926758, 'learning_rate': 7.958250471278772e-07}[Rank 1] Trainer log: {'loss': 0.7825, 'grad_norm': 4.332063674926758, 'learning_rate': 7.958250471278772e-07} {'loss': 0.7825, 'grad_norm': 4.332063674926758, 'learning_rate': 7.958250471278772e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.9001, 'grad_norm': 4.280857086181641, 'learning_rate': 7.931613215195089e-07}[Rank 3] Trainer log: {'loss': 0.9001, 'grad_norm': 4.280857086181641, 'learning_rate': 7.931613215195089e-07} [Rank 1] Trainer log: {'loss': 0.9001, 'grad_norm': 4.280857086181641, 'learning_rate': 7.931613215195089e-07} [Rank 2] Trainer log: {'loss': 0.9001, 'grad_norm': 4.280857086181641, 'learning_rate': 7.931613215195089e-07} {'loss': 0.9001, 'grad_norm': 4.280857086181641, 'learning_rate': 7.931613215195089e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.81, 'grad_norm': 3.595587968826294, 'learning_rate': 7.905018771931661e-07} [Rank 0] Trainer log: {'loss': 0.81, 'grad_norm': 3.595587968826294, 'learning_rate': 7.905018771931661e-07}[Rank 1] Trainer log: {'loss': 0.81, 'grad_norm': 3.595587968826294, 'learning_rate': 7.905018771931661e-07} [Rank 2] Trainer log: {'loss': 0.81, 'grad_norm': 3.595587968826294, 'learning_rate': 7.905018771931661e-07} {'loss': 0.81, 'grad_norm': 3.595587968826294, 'learning_rate': 7.905018771931661e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 1.0032, 'grad_norm': 2.3953540325164795, 'learning_rate': 7.878467153855218e-07} [Rank 0] Trainer log: {'loss': 1.0032, 'grad_norm': 2.3953540325164795, 'learning_rate': 7.878467153855218e-07}[Rank 3] Trainer log: {'loss': 1.0032, 'grad_norm': 2.3953540325164795, 'learning_rate': 7.878467153855218e-07} [Rank 1] Trainer log: {'loss': 1.0032, 'grad_norm': 2.3953540325164795, 'learning_rate': 7.878467153855218e-07} {'loss': 1.0032, 'grad_norm': 2.3953540325164795, 'learning_rate': 7.878467153855218e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.9025, 'grad_norm': 4.539964199066162, 'learning_rate': 7.851958373312529e-07}[Rank 3] Trainer log: {'loss': 0.9025, 'grad_norm': 4.539964199066162, 'learning_rate': 7.851958373312529e-07} [Rank 0] Trainer log: {'loss': 0.9025, 'grad_norm': 4.539964199066162, 'learning_rate': 7.851958373312529e-07} [Rank 1] Trainer log: {'loss': 0.9025, 'grad_norm': 4.539964199066162, 'learning_rate': 7.851958373312529e-07} {'loss': 0.9025, 'grad_norm': 4.539964199066162, 'learning_rate': 7.851958373312529e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.8888, 'grad_norm': 6.496451377868652, 'learning_rate': 7.825492442630478e-07} [Rank 2] Trainer log: {'loss': 0.8888, 'grad_norm': 6.496451377868652, 'learning_rate': 7.825492442630478e-07}[Rank 1] Trainer log: {'loss': 0.8888, 'grad_norm': 6.496451377868652, 'learning_rate': 7.825492442630478e-07} [Rank 0] Trainer log: {'loss': 0.8888, 'grad_norm': 6.496451377868652, 'learning_rate': 7.825492442630478e-07} {'loss': 0.8888, 'grad_norm': 6.496451377868652, 'learning_rate': 7.825492442630478e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.9439, 'grad_norm': 2.9328339099884033, 'learning_rate': 7.799069374116053e-07}[Rank 3] Trainer log: {'loss': 0.9439, 'grad_norm': 2.9328339099884033, 'learning_rate': 7.799069374116053e-07}[Rank 0] Trainer log: {'loss': 0.9439, 'grad_norm': 2.9328339099884033, 'learning_rate': 7.799069374116053e-07} [Rank 1] Trainer log: {'loss': 0.9439, 'grad_norm': 2.9328339099884033, 'learning_rate': 7.799069374116053e-07} {'loss': 0.9439, 'grad_norm': 2.9328339099884033, 'learning_rate': 7.799069374116053e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.691, 'grad_norm': 2.2826266288757324, 'learning_rate': 7.772689180056236e-07}[Rank 0] Trainer log: {'loss': 0.691, 'grad_norm': 2.2826266288757324, 'learning_rate': 7.772689180056236e-07}[Rank 3] Trainer log: {'loss': 0.691, 'grad_norm': 2.2826266288757324, 'learning_rate': 7.772689180056236e-07} [Rank 1] Trainer log: {'loss': 0.691, 'grad_norm': 2.2826266288757324, 'learning_rate': 7.772689180056236e-07} {'loss': 0.691, 'grad_norm': 2.2826266288757324, 'learning_rate': 7.772689180056236e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 1.0395, 'grad_norm': 2.625298500061035, 'learning_rate': 7.746351872718106e-07}[Rank 0] Trainer log: {'loss': 1.0395, 'grad_norm': 2.625298500061035, 'learning_rate': 7.746351872718106e-07}[Rank 1] Trainer log: {'loss': 1.0395, 'grad_norm': 2.625298500061035, 'learning_rate': 7.746351872718106e-07} [Rank 2] Trainer log: {'loss': 1.0395, 'grad_norm': 2.625298500061035, 'learning_rate': 7.746351872718106e-07} {'loss': 1.0395, 'grad_norm': 2.625298500061035, 'learning_rate': 7.746351872718106e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.9328, 'grad_norm': 4.7621660232543945, 'learning_rate': 7.72005746434884e-07} [Rank 0] Trainer log: {'loss': 0.9328, 'grad_norm': 4.7621660232543945, 'learning_rate': 7.72005746434884e-07}[Rank 3] Trainer log: {'loss': 0.9328, 'grad_norm': 4.7621660232543945, 'learning_rate': 7.72005746434884e-07} [Rank 1] Trainer log: {'loss': 0.9328, 'grad_norm': 4.7621660232543945, 'learning_rate': 7.72005746434884e-07} {'loss': 0.9328, 'grad_norm': 4.7621660232543945, 'learning_rate': 7.72005746434884e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.7486, 'grad_norm': 3.9333276748657227, 'learning_rate': 7.693805967175594e-07} [Rank 3] Trainer log: {'loss': 0.7486, 'grad_norm': 3.9333276748657227, 'learning_rate': 7.693805967175594e-07} [Rank 1] Trainer log: {'loss': 0.7486, 'grad_norm': 3.9333276748657227, 'learning_rate': 7.693805967175594e-07} [Rank 0] Trainer log: {'loss': 0.7486, 'grad_norm': 3.9333276748657227, 'learning_rate': 7.693805967175594e-07} {'loss': 0.7486, 'grad_norm': 3.9333276748657227, 'learning_rate': 7.693805967175594e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.7749, 'grad_norm': 3.4899141788482666, 'learning_rate': 7.667597393405602e-07}[Rank 3] Trainer log: {'loss': 0.7749, 'grad_norm': 3.4899141788482666, 'learning_rate': 7.667597393405602e-07}[Rank 0] Trainer log: {'loss': 0.7749, 'grad_norm': 3.4899141788482666, 'learning_rate': 7.667597393405602e-07} [Rank 2] Trainer log: {'loss': 0.7749, 'grad_norm': 3.4899141788482666, 'learning_rate': 7.667597393405602e-07} {'loss': 0.7749, 'grad_norm': 3.4899141788482666, 'learning_rate': 7.667597393405602e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.6067, 'grad_norm': 4.561038970947266, 'learning_rate': 7.641431755226169e-07}[Rank 3] Trainer log: {'loss': 0.6067, 'grad_norm': 4.561038970947266, 'learning_rate': 7.641431755226169e-07}[Rank 1] Trainer log: {'loss': 0.6067, 'grad_norm': 4.561038970947266, 'learning_rate': 7.641431755226169e-07} [Rank 2] Trainer log: {'loss': 0.6067, 'grad_norm': 4.561038970947266, 'learning_rate': 7.641431755226169e-07} {'loss': 0.6067, 'grad_norm': 4.561038970947266, 'learning_rate': 7.641431755226169e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.9776, 'grad_norm': 1.9865657091140747, 'learning_rate': 7.615309064804588e-07} [Rank 3] Trainer log: {'loss': 0.9776, 'grad_norm': 1.9865657091140747, 'learning_rate': 7.615309064804588e-07}[Rank 0] Trainer log: {'loss': 0.9776, 'grad_norm': 1.9865657091140747, 'learning_rate': 7.615309064804588e-07} [Rank 2] Trainer log: {'loss': 0.9776, 'grad_norm': 1.9865657091140747, 'learning_rate': 7.615309064804588e-07} {'loss': 0.9776, 'grad_norm': 1.9865657091140747, 'learning_rate': 7.615309064804588e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.7496, 'grad_norm': 11.034297943115234, 'learning_rate': 7.589229334288196e-07}[Rank 0] Trainer log: {'loss': 0.7496, 'grad_norm': 11.034297943115234, 'learning_rate': 7.589229334288196e-07} [Rank 2] Trainer log: {'loss': 0.7496, 'grad_norm': 11.034297943115234, 'learning_rate': 7.589229334288196e-07} [Rank 1] Trainer log: {'loss': 0.7496, 'grad_norm': 11.034297943115234, 'learning_rate': 7.589229334288196e-07} {'loss': 0.7496, 'grad_norm': 11.034297943115234, 'learning_rate': 7.589229334288196e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8497, 'grad_norm': 2.354625701904297, 'learning_rate': 7.56319257580439e-07}[Rank 3] Trainer log: {'loss': 0.8497, 'grad_norm': 2.354625701904297, 'learning_rate': 7.56319257580439e-07}[Rank 0] Trainer log: {'loss': 0.8497, 'grad_norm': 2.354625701904297, 'learning_rate': 7.56319257580439e-07} [Rank 2] Trainer log: {'loss': 0.8497, 'grad_norm': 2.354625701904297, 'learning_rate': 7.56319257580439e-07} {'loss': 0.8497, 'grad_norm': 2.354625701904297, 'learning_rate': 7.56319257580439e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.8876, 'grad_norm': 10.458855628967285, 'learning_rate': 7.53719880146051e-07}[Rank 1] Trainer log: {'loss': 0.8876, 'grad_norm': 10.458855628967285, 'learning_rate': 7.53719880146051e-07}[Rank 3] Trainer log: {'loss': 0.8876, 'grad_norm': 10.458855628967285, 'learning_rate': 7.53719880146051e-07} [Rank 2] Trainer log: {'loss': 0.8876, 'grad_norm': 10.458855628967285, 'learning_rate': 7.53719880146051e-07}{'loss': 0.8876, 'grad_norm': 10.458855628967285, 'learning_rate': 7.53719880146051e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.7195, 'grad_norm': 3.060640335083008, 'learning_rate': 7.511248023343964e-07} [Rank 3] Trainer log: {'loss': 0.7195, 'grad_norm': 3.060640335083008, 'learning_rate': 7.511248023343964e-07}[Rank 1] Trainer log: {'loss': 0.7195, 'grad_norm': 3.060640335083008, 'learning_rate': 7.511248023343964e-07} [Rank 0] Trainer log: {'loss': 0.7195, 'grad_norm': 3.060640335083008, 'learning_rate': 7.511248023343964e-07} {'loss': 0.7195, 'grad_norm': 3.060640335083008, 'learning_rate': 7.511248023343964e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.7512, 'grad_norm': 2.736769437789917, 'learning_rate': 7.485340253522166e-07} [Rank 0] Trainer log: {'loss': 0.7512, 'grad_norm': 2.736769437789917, 'learning_rate': 7.485340253522166e-07}[Rank 3] Trainer log: {'loss': 0.7512, 'grad_norm': 2.736769437789917, 'learning_rate': 7.485340253522166e-07} [Rank 2] Trainer log: {'loss': 0.7512, 'grad_norm': 2.736769437789917, 'learning_rate': 7.485340253522166e-07} {'loss': 0.7512, 'grad_norm': 2.736769437789917, 'learning_rate': 7.485340253522166e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.7785, 'grad_norm': 2.932015895843506, 'learning_rate': 7.459475504042501e-07} [Rank 3] Trainer log: {'loss': 0.7785, 'grad_norm': 2.932015895843506, 'learning_rate': 7.459475504042501e-07}[Rank 2] Trainer log: {'loss': 0.7785, 'grad_norm': 2.932015895843506, 'learning_rate': 7.459475504042501e-07} [Rank 0] Trainer log: {'loss': 0.7785, 'grad_norm': 2.932015895843506, 'learning_rate': 7.459475504042501e-07} {'loss': 0.7785, 'grad_norm': 2.932015895843506, 'learning_rate': 7.459475504042501e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 1.0568, 'grad_norm': 2.4101860523223877, 'learning_rate': 7.43365378693236e-07} [Rank 1] Trainer log: {'loss': 1.0568, 'grad_norm': 2.4101860523223877, 'learning_rate': 7.43365378693236e-07}[Rank 0] Trainer log: {'loss': 1.0568, 'grad_norm': 2.4101860523223877, 'learning_rate': 7.43365378693236e-07} [Rank 2] Trainer log: {'loss': 1.0568, 'grad_norm': 2.4101860523223877, 'learning_rate': 7.43365378693236e-07} {'loss': 1.0568, 'grad_norm': 2.4101860523223877, 'learning_rate': 7.43365378693236e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8818, 'grad_norm': 3.602766752243042, 'learning_rate': 7.407875114199137e-07}[Rank 2] Trainer log: {'loss': 0.8818, 'grad_norm': 3.602766752243042, 'learning_rate': 7.407875114199137e-07} [Rank 3] Trainer log: {'loss': 0.8818, 'grad_norm': 3.602766752243042, 'learning_rate': 7.407875114199137e-07} [Rank 0] Trainer log: {'loss': 0.8818, 'grad_norm': 3.602766752243042, 'learning_rate': 7.407875114199137e-07} {'loss': 0.8818, 'grad_norm': 3.602766752243042, 'learning_rate': 7.407875114199137e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.6792, 'grad_norm': 7.61569881439209, 'learning_rate': 7.382139497830187e-07}[Rank 2] Trainer log: {'loss': 0.6792, 'grad_norm': 7.61569881439209, 'learning_rate': 7.382139497830187e-07} [Rank 1] Trainer log: {'loss': 0.6792, 'grad_norm': 7.61569881439209, 'learning_rate': 7.382139497830187e-07} [Rank 0] Trainer log: {'loss': 0.6792, 'grad_norm': 7.61569881439209, 'learning_rate': 7.382139497830187e-07} {'loss': 0.6792, 'grad_norm': 7.61569881439209, 'learning_rate': 7.382139497830187e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.9553, 'grad_norm': 4.153801441192627, 'learning_rate': 7.356446949792872e-07} [Rank 1] Trainer log: {'loss': 0.9553, 'grad_norm': 4.153801441192627, 'learning_rate': 7.356446949792872e-07} [Rank 2] Trainer log: {'loss': 0.9553, 'grad_norm': 4.153801441192627, 'learning_rate': 7.356446949792872e-07} [Rank 0] Trainer log: {'loss': 0.9553, 'grad_norm': 4.153801441192627, 'learning_rate': 7.356446949792872e-07} {'loss': 0.9553, 'grad_norm': 4.153801441192627, 'learning_rate': 7.356446949792872e-07, 'epoch': 0.88} [Rank 2] Trainer log: {'loss': 0.9004, 'grad_norm': 4.329833030700684, 'learning_rate': 7.330797482034524e-07}[Rank 3] Trainer log: {'loss': 0.9004, 'grad_norm': 4.329833030700684, 'learning_rate': 7.330797482034524e-07} [Rank 0] Trainer log: {'loss': 0.9004, 'grad_norm': 4.329833030700684, 'learning_rate': 7.330797482034524e-07} [Rank 1] Trainer log: {'loss': 0.9004, 'grad_norm': 4.329833030700684, 'learning_rate': 7.330797482034524e-07} {'loss': 0.9004, 'grad_norm': 4.329833030700684, 'learning_rate': 7.330797482034524e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.8579, 'grad_norm': 8.113931655883789, 'learning_rate': 7.305191106482401e-07}[Rank 3] Trainer log: {'loss': 0.8579, 'grad_norm': 8.113931655883789, 'learning_rate': 7.305191106482401e-07} [Rank 1] Trainer log: {'loss': 0.8579, 'grad_norm': 8.113931655883789, 'learning_rate': 7.305191106482401e-07} [Rank 2] Trainer log: {'loss': 0.8579, 'grad_norm': 8.113931655883789, 'learning_rate': 7.305191106482401e-07}{'loss': 0.8579, 'grad_norm': 8.113931655883789, 'learning_rate': 7.305191106482401e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8946, 'grad_norm': 2.3279144763946533, 'learning_rate': 7.279627835043757e-07}[Rank 3] Trainer log: {'loss': 0.8946, 'grad_norm': 2.3279144763946533, 'learning_rate': 7.279627835043757e-07} [Rank 0] Trainer log: {'loss': 0.8946, 'grad_norm': 2.3279144763946533, 'learning_rate': 7.279627835043757e-07}[Rank 2] Trainer log: {'loss': 0.8946, 'grad_norm': 2.3279144763946533, 'learning_rate': 7.279627835043757e-07} {'loss': 0.8946, 'grad_norm': 2.3279144763946533, 'learning_rate': 7.279627835043757e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.881, 'grad_norm': 4.984705924987793, 'learning_rate': 7.254107679605815e-07}[Rank 0] Trainer log: {'loss': 0.881, 'grad_norm': 4.984705924987793, 'learning_rate': 7.254107679605815e-07}[Rank 3] Trainer log: {'loss': 0.881, 'grad_norm': 4.984705924987793, 'learning_rate': 7.254107679605815e-07} [Rank 2] Trainer log: {'loss': 0.881, 'grad_norm': 4.984705924987793, 'learning_rate': 7.254107679605815e-07} {'loss': 0.881, 'grad_norm': 4.984705924987793, 'learning_rate': 7.254107679605815e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.8984, 'grad_norm': 4.290565013885498, 'learning_rate': 7.228630652035717e-07} [Rank 0] Trainer log: {'loss': 0.8984, 'grad_norm': 4.290565013885498, 'learning_rate': 7.228630652035717e-07}[Rank 1] Trainer log: {'loss': 0.8984, 'grad_norm': 4.290565013885498, 'learning_rate': 7.228630652035717e-07} [Rank 2] Trainer log: {'loss': 0.8984, 'grad_norm': 4.290565013885498, 'learning_rate': 7.228630652035717e-07} {'loss': 0.8984, 'grad_norm': 4.290565013885498, 'learning_rate': 7.228630652035717e-07, 'epoch': 0.88} [Rank 1] Trainer log: {'loss': 0.8031, 'grad_norm': 2.2936086654663086, 'learning_rate': 7.203196764180576e-07}[Rank 0] Trainer log: {'loss': 0.8031, 'grad_norm': 2.2936086654663086, 'learning_rate': 7.203196764180576e-07}[Rank 2] Trainer log: {'loss': 0.8031, 'grad_norm': 2.2936086654663086, 'learning_rate': 7.203196764180576e-07} [Rank 3] Trainer log: {'loss': 0.8031, 'grad_norm': 2.2936086654663086, 'learning_rate': 7.203196764180576e-07} {'loss': 0.8031, 'grad_norm': 2.2936086654663086, 'learning_rate': 7.203196764180576e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.7605, 'grad_norm': 4.523890018463135, 'learning_rate': 7.177806027867395e-07}[Rank 1] Trainer log: {'loss': 0.7605, 'grad_norm': 4.523890018463135, 'learning_rate': 7.177806027867395e-07} [Rank 0] Trainer log: {'loss': 0.7605, 'grad_norm': 4.523890018463135, 'learning_rate': 7.177806027867395e-07} [Rank 2] Trainer log: {'loss': 0.7605, 'grad_norm': 4.523890018463135, 'learning_rate': 7.177806027867395e-07} {'loss': 0.7605, 'grad_norm': 4.523890018463135, 'learning_rate': 7.177806027867395e-07, 'epoch': 0.88} [Rank 0] Trainer log: {'loss': 0.7926, 'grad_norm': 8.286264419555664, 'learning_rate': 7.152458454903166e-07}[Rank 3] Trainer log: {'loss': 0.7926, 'grad_norm': 8.286264419555664, 'learning_rate': 7.152458454903166e-07}[Rank 1] Trainer log: {'loss': 0.7926, 'grad_norm': 8.286264419555664, 'learning_rate': 7.152458454903166e-07} [Rank 2] Trainer log: {'loss': 0.7926, 'grad_norm': 8.286264419555664, 'learning_rate': 7.152458454903166e-07} {'loss': 0.7926, 'grad_norm': 8.286264419555664, 'learning_rate': 7.152458454903166e-07, 'epoch': 0.88} [Rank 3] Trainer log: {'loss': 0.9201, 'grad_norm': 2.6637046337127686, 'learning_rate': 7.127154057074825e-07}[Rank 1] Trainer log: {'loss': 0.9201, 'grad_norm': 2.6637046337127686, 'learning_rate': 7.127154057074825e-07} [Rank 0] Trainer log: {'loss': 0.9201, 'grad_norm': 2.6637046337127686, 'learning_rate': 7.127154057074825e-07}[Rank 2] Trainer log: {'loss': 0.9201, 'grad_norm': 2.6637046337127686, 'learning_rate': 7.127154057074825e-07} {'loss': 0.9201, 'grad_norm': 2.6637046337127686, 'learning_rate': 7.127154057074825e-07, 'epoch': 0.89} [Rank 2] Trainer log: {'loss': 0.7051, 'grad_norm': 3.697519540786743, 'learning_rate': 7.10189284614915e-07}[Rank 3] Trainer log: {'loss': 0.7051, 'grad_norm': 3.697519540786743, 'learning_rate': 7.10189284614915e-07} [Rank 0] Trainer log: {'loss': 0.7051, 'grad_norm': 3.697519540786743, 'learning_rate': 7.10189284614915e-07} [Rank 1] Trainer log: {'loss': 0.7051, 'grad_norm': 3.697519540786743, 'learning_rate': 7.10189284614915e-07} {'loss': 0.7051, 'grad_norm': 3.697519540786743, 'learning_rate': 7.10189284614915e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8188, 'grad_norm': 1.8671938180923462, 'learning_rate': 7.076674833872921e-07} [Rank 1] Trainer log: {'loss': 0.8188, 'grad_norm': 1.8671938180923462, 'learning_rate': 7.076674833872921e-07}[Rank 0] Trainer log: {'loss': 0.8188, 'grad_norm': 1.8671938180923462, 'learning_rate': 7.076674833872921e-07} [Rank 2] Trainer log: {'loss': 0.8188, 'grad_norm': 1.8671938180923462, 'learning_rate': 7.076674833872921e-07} {'loss': 0.8188, 'grad_norm': 1.8671938180923462, 'learning_rate': 7.076674833872921e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.7996, 'grad_norm': 3.461564064025879, 'learning_rate': 7.05150003197278e-07}[Rank 1] Trainer log: {'loss': 0.7996, 'grad_norm': 3.461564064025879, 'learning_rate': 7.05150003197278e-07} [Rank 0] Trainer log: {'loss': 0.7996, 'grad_norm': 3.461564064025879, 'learning_rate': 7.05150003197278e-07} [Rank 2] Trainer log: {'loss': 0.7996, 'grad_norm': 3.461564064025879, 'learning_rate': 7.05150003197278e-07} {'loss': 0.7996, 'grad_norm': 3.461564064025879, 'learning_rate': 7.05150003197278e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8015, 'grad_norm': 6.064676284790039, 'learning_rate': 7.026368452155264e-07}[Rank 1] Trainer log: {'loss': 0.8015, 'grad_norm': 6.064676284790039, 'learning_rate': 7.026368452155264e-07} [Rank 2] Trainer log: {'loss': 0.8015, 'grad_norm': 6.064676284790039, 'learning_rate': 7.026368452155264e-07} [Rank 0] Trainer log: {'loss': 0.8015, 'grad_norm': 6.064676284790039, 'learning_rate': 7.026368452155264e-07} {'loss': 0.8015, 'grad_norm': 6.064676284790039, 'learning_rate': 7.026368452155264e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 0.6955, 'grad_norm': 2.9137401580810547, 'learning_rate': 7.001280106106878e-07}[Rank 2] Trainer log: {'loss': 0.6955, 'grad_norm': 2.9137401580810547, 'learning_rate': 7.001280106106878e-07} [Rank 3] Trainer log: {'loss': 0.6955, 'grad_norm': 2.9137401580810547, 'learning_rate': 7.001280106106878e-07} [Rank 0] Trainer log: {'loss': 0.6955, 'grad_norm': 2.9137401580810547, 'learning_rate': 7.001280106106878e-07} {'loss': 0.6955, 'grad_norm': 2.9137401580810547, 'learning_rate': 7.001280106106878e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.7436, 'grad_norm': 3.2615809440612793, 'learning_rate': 6.976235005493959e-07}[Rank 1] Trainer log: {'loss': 0.7436, 'grad_norm': 3.2615809440612793, 'learning_rate': 6.976235005493959e-07} [Rank 2] Trainer log: {'loss': 0.7436, 'grad_norm': 3.2615809440612793, 'learning_rate': 6.976235005493959e-07} [Rank 0] Trainer log: {'loss': 0.7436, 'grad_norm': 3.2615809440612793, 'learning_rate': 6.976235005493959e-07} {'loss': 0.7436, 'grad_norm': 3.2615809440612793, 'learning_rate': 6.976235005493959e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.6491, 'grad_norm': 5.486334323883057, 'learning_rate': 6.951233161962745e-07}[Rank 1] Trainer log: {'loss': 0.6491, 'grad_norm': 5.486334323883057, 'learning_rate': 6.951233161962745e-07} [Rank 0] Trainer log: {'loss': 0.6491, 'grad_norm': 5.486334323883057, 'learning_rate': 6.951233161962745e-07}[Rank 2] Trainer log: {'loss': 0.6491, 'grad_norm': 5.486334323883057, 'learning_rate': 6.951233161962745e-07} {'loss': 0.6491, 'grad_norm': 5.486334323883057, 'learning_rate': 6.951233161962745e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.9339, 'grad_norm': 7.930050373077393, 'learning_rate': 6.926274587139381e-07} [Rank 1] Trainer log: {'loss': 0.9339, 'grad_norm': 7.930050373077393, 'learning_rate': 6.926274587139381e-07}[Rank 0] Trainer log: {'loss': 0.9339, 'grad_norm': 7.930050373077393, 'learning_rate': 6.926274587139381e-07} [Rank 2] Trainer log: {'loss': 0.9339, 'grad_norm': 7.930050373077393, 'learning_rate': 6.926274587139381e-07} {'loss': 0.9339, 'grad_norm': 7.930050373077393, 'learning_rate': 6.926274587139381e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 0.9888, 'grad_norm': 3.0634994506835938, 'learning_rate': 6.901359292629906e-07} [Rank 3] Trainer log: {'loss': 0.9888, 'grad_norm': 3.0634994506835938, 'learning_rate': 6.901359292629906e-07} [Rank 2] Trainer log: {'loss': 0.9888, 'grad_norm': 3.0634994506835938, 'learning_rate': 6.901359292629906e-07}[Rank 0] Trainer log: {'loss': 0.9888, 'grad_norm': 3.0634994506835938, 'learning_rate': 6.901359292629906e-07} {'loss': 0.9888, 'grad_norm': 3.0634994506835938, 'learning_rate': 6.901359292629906e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 1.0524, 'grad_norm': 4.1132707595825195, 'learning_rate': 6.876487290020162e-07}[Rank 1] Trainer log: {'loss': 1.0524, 'grad_norm': 4.1132707595825195, 'learning_rate': 6.876487290020162e-07} [Rank 0] Trainer log: {'loss': 1.0524, 'grad_norm': 4.1132707595825195, 'learning_rate': 6.876487290020162e-07} [Rank 2] Trainer log: {'loss': 1.0524, 'grad_norm': 4.1132707595825195, 'learning_rate': 6.876487290020162e-07} {'loss': 1.0524, 'grad_norm': 4.1132707595825195, 'learning_rate': 6.876487290020162e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.9481, 'grad_norm': 5.6486077308654785, 'learning_rate': 6.851658590875953e-07}[Rank 3] Trainer log: {'loss': 0.9481, 'grad_norm': 5.6486077308654785, 'learning_rate': 6.851658590875953e-07} [Rank 2] Trainer log: {'loss': 0.9481, 'grad_norm': 5.6486077308654785, 'learning_rate': 6.851658590875953e-07} [Rank 1] Trainer log: {'loss': 0.9481, 'grad_norm': 5.6486077308654785, 'learning_rate': 6.851658590875953e-07} {'loss': 0.9481, 'grad_norm': 5.6486077308654785, 'learning_rate': 6.851658590875953e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.7116, 'grad_norm': 3.4354004859924316, 'learning_rate': 6.826873206742868e-07}[Rank 1] Trainer log: {'loss': 0.7116, 'grad_norm': 3.4354004859924316, 'learning_rate': 6.826873206742868e-07} [Rank 0] Trainer log: {'loss': 0.7116, 'grad_norm': 3.4354004859924316, 'learning_rate': 6.826873206742868e-07}[Rank 2] Trainer log: {'loss': 0.7116, 'grad_norm': 3.4354004859924316, 'learning_rate': 6.826873206742868e-07} {'loss': 0.7116, 'grad_norm': 3.4354004859924316, 'learning_rate': 6.826873206742868e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 1.0391, 'grad_norm': 3.3684275150299072, 'learning_rate': 6.802131149146374e-07} [Rank 1] Trainer log: {'loss': 1.0391, 'grad_norm': 3.3684275150299072, 'learning_rate': 6.802131149146374e-07}[Rank 0] Trainer log: {'loss': 1.0391, 'grad_norm': 3.3684275150299072, 'learning_rate': 6.802131149146374e-07} [Rank 2] Trainer log: {'loss': 1.0391, 'grad_norm': 3.3684275150299072, 'learning_rate': 6.802131149146374e-07} {'loss': 1.0391, 'grad_norm': 3.3684275150299072, 'learning_rate': 6.802131149146374e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.8038, 'grad_norm': 11.229547500610352, 'learning_rate': 6.777432429591846e-07}[Rank 3] Trainer log: {'loss': 0.8038, 'grad_norm': 11.229547500610352, 'learning_rate': 6.777432429591846e-07}[Rank 1] Trainer log: {'loss': 0.8038, 'grad_norm': 11.229547500610352, 'learning_rate': 6.777432429591846e-07} [Rank 2] Trainer log: {'loss': 0.8038, 'grad_norm': 11.229547500610352, 'learning_rate': 6.777432429591846e-07} {'loss': 0.8038, 'grad_norm': 11.229547500610352, 'learning_rate': 6.777432429591846e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.7658, 'grad_norm': 2.276210069656372, 'learning_rate': 6.752777059564431e-07}[Rank 2] Trainer log: {'loss': 0.7658, 'grad_norm': 2.276210069656372, 'learning_rate': 6.752777059564431e-07} [Rank 3] Trainer log: {'loss': 0.7658, 'grad_norm': 2.276210069656372, 'learning_rate': 6.752777059564431e-07}[Rank 1] Trainer log: {'loss': 0.7658, 'grad_norm': 2.276210069656372, 'learning_rate': 6.752777059564431e-07} {'loss': 0.7658, 'grad_norm': 2.276210069656372, 'learning_rate': 6.752777059564431e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8911, 'grad_norm': 1.6645426750183105, 'learning_rate': 6.728165050529146e-07}[Rank 2] Trainer log: {'loss': 0.8911, 'grad_norm': 1.6645426750183105, 'learning_rate': 6.728165050529146e-07}[Rank 1] Trainer log: {'loss': 0.8911, 'grad_norm': 1.6645426750183105, 'learning_rate': 6.728165050529146e-07} [Rank 0] Trainer log: {'loss': 0.8911, 'grad_norm': 1.6645426750183105, 'learning_rate': 6.728165050529146e-07} {'loss': 0.8911, 'grad_norm': 1.6645426750183105, 'learning_rate': 6.728165050529146e-07, 'epoch': 0.89} [Rank 2] Trainer log: {'loss': 0.9268, 'grad_norm': 6.007839679718018, 'learning_rate': 6.703596413930858e-07}[Rank 1] Trainer log: {'loss': 0.9268, 'grad_norm': 6.007839679718018, 'learning_rate': 6.703596413930858e-07} [Rank 3] Trainer log: {'loss': 0.9268, 'grad_norm': 6.007839679718018, 'learning_rate': 6.703596413930858e-07} [Rank 0] Trainer log: {'loss': 0.9268, 'grad_norm': 6.007839679718018, 'learning_rate': 6.703596413930858e-07} {'loss': 0.9268, 'grad_norm': 6.007839679718018, 'learning_rate': 6.703596413930858e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.9225, 'grad_norm': 5.7755255699157715, 'learning_rate': 6.679071161194261e-07} [Rank 2] Trainer log: {'loss': 0.9225, 'grad_norm': 5.7755255699157715, 'learning_rate': 6.679071161194261e-07} [Rank 1] Trainer log: {'loss': 0.9225, 'grad_norm': 5.7755255699157715, 'learning_rate': 6.679071161194261e-07} [Rank 0] Trainer log: {'loss': 0.9225, 'grad_norm': 5.7755255699157715, 'learning_rate': 6.679071161194261e-07} {'loss': 0.9225, 'grad_norm': 5.7755255699157715, 'learning_rate': 6.679071161194261e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 1.0301, 'grad_norm': 3.3914635181427, 'learning_rate': 6.654589303723858e-07}[Rank 1] Trainer log: {'loss': 1.0301, 'grad_norm': 3.3914635181427, 'learning_rate': 6.654589303723858e-07}[Rank 0] Trainer log: {'loss': 1.0301, 'grad_norm': 3.3914635181427, 'learning_rate': 6.654589303723858e-07} [Rank 2] Trainer log: {'loss': 1.0301, 'grad_norm': 3.3914635181427, 'learning_rate': 6.654589303723858e-07} {'loss': 1.0301, 'grad_norm': 3.3914635181427, 'learning_rate': 6.654589303723858e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8389, 'grad_norm': 7.431430816650391, 'learning_rate': 6.630150852904005e-07} [Rank 0] Trainer log: {'loss': 0.8389, 'grad_norm': 7.431430816650391, 'learning_rate': 6.630150852904005e-07}[Rank 2] Trainer log: {'loss': 0.8389, 'grad_norm': 7.431430816650391, 'learning_rate': 6.630150852904005e-07}[Rank 1] Trainer log: {'loss': 0.8389, 'grad_norm': 7.431430816650391, 'learning_rate': 6.630150852904005e-07} {'loss': 0.8389, 'grad_norm': 7.431430816650391, 'learning_rate': 6.630150852904005e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8178, 'grad_norm': 5.046987056732178, 'learning_rate': 6.605755820098825e-07}[Rank 0] Trainer log: {'loss': 0.8178, 'grad_norm': 5.046987056732178, 'learning_rate': 6.605755820098825e-07} [Rank 1] Trainer log: {'loss': 0.8178, 'grad_norm': 5.046987056732178, 'learning_rate': 6.605755820098825e-07} [Rank 2] Trainer log: {'loss': 0.8178, 'grad_norm': 5.046987056732178, 'learning_rate': 6.605755820098825e-07} {'loss': 0.8178, 'grad_norm': 5.046987056732178, 'learning_rate': 6.605755820098825e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 1.0782, 'grad_norm': 2.3082315921783447, 'learning_rate': 6.581404216652298e-07} [Rank 0] Trainer log: {'loss': 1.0782, 'grad_norm': 2.3082315921783447, 'learning_rate': 6.581404216652298e-07}[Rank 2] Trainer log: {'loss': 1.0782, 'grad_norm': 2.3082315921783447, 'learning_rate': 6.581404216652298e-07} [Rank 1] Trainer log: {'loss': 1.0782, 'grad_norm': 2.3082315921783447, 'learning_rate': 6.581404216652298e-07} {'loss': 1.0782, 'grad_norm': 2.3082315921783447, 'learning_rate': 6.581404216652298e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 0.8453, 'grad_norm': 3.687462568283081, 'learning_rate': 6.557096053888167e-07} [Rank 0] Trainer log: {'loss': 0.8453, 'grad_norm': 3.687462568283081, 'learning_rate': 6.557096053888167e-07}[Rank 3] Trainer log: {'loss': 0.8453, 'grad_norm': 3.687462568283081, 'learning_rate': 6.557096053888167e-07} [Rank 2] Trainer log: {'loss': 0.8453, 'grad_norm': 3.687462568283081, 'learning_rate': 6.557096053888167e-07} {'loss': 0.8453, 'grad_norm': 3.687462568283081, 'learning_rate': 6.557096053888167e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 1.0009, 'grad_norm': 3.3851373195648193, 'learning_rate': 6.532831343110036e-07}[Rank 3] Trainer log: {'loss': 1.0009, 'grad_norm': 3.3851373195648193, 'learning_rate': 6.532831343110036e-07} [Rank 2] Trainer log: {'loss': 1.0009, 'grad_norm': 3.3851373195648193, 'learning_rate': 6.532831343110036e-07}[Rank 0] Trainer log: {'loss': 1.0009, 'grad_norm': 3.3851373195648193, 'learning_rate': 6.532831343110036e-07} {'loss': 1.0009, 'grad_norm': 3.3851373195648193, 'learning_rate': 6.532831343110036e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 1.0792, 'grad_norm': 2.5449130535125732, 'learning_rate': 6.508610095601209e-07}[Rank 1] Trainer log: {'loss': 1.0792, 'grad_norm': 2.5449130535125732, 'learning_rate': 6.508610095601209e-07} [Rank 2] Trainer log: {'loss': 1.0792, 'grad_norm': 2.5449130535125732, 'learning_rate': 6.508610095601209e-07} [Rank 3] Trainer log: {'loss': 1.0792, 'grad_norm': 2.5449130535125732, 'learning_rate': 6.508610095601209e-07} {'loss': 1.0792, 'grad_norm': 2.5449130535125732, 'learning_rate': 6.508610095601209e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.7308, 'grad_norm': 5.694406986236572, 'learning_rate': 6.484432322624878e-07}[Rank 1] Trainer log: {'loss': 0.7308, 'grad_norm': 5.694406986236572, 'learning_rate': 6.484432322624878e-07}[Rank 2] Trainer log: {'loss': 0.7308, 'grad_norm': 5.694406986236572, 'learning_rate': 6.484432322624878e-07} [Rank 3] Trainer log: {'loss': 0.7308, 'grad_norm': 5.694406986236572, 'learning_rate': 6.484432322624878e-07} {'loss': 0.7308, 'grad_norm': 5.694406986236572, 'learning_rate': 6.484432322624878e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 1.0582, 'grad_norm': 7.293266296386719, 'learning_rate': 6.460298035423973e-07}[Rank 1] Trainer log: {'loss': 1.0582, 'grad_norm': 7.293266296386719, 'learning_rate': 6.460298035423973e-07} [Rank 3] Trainer log: {'loss': 1.0582, 'grad_norm': 7.293266296386719, 'learning_rate': 6.460298035423973e-07} [Rank 2] Trainer log: {'loss': 1.0582, 'grad_norm': 7.293266296386719, 'learning_rate': 6.460298035423973e-07} {'loss': 1.0582, 'grad_norm': 7.293266296386719, 'learning_rate': 6.460298035423973e-07, 'epoch': 0.89} [Rank 2] Trainer log: {'loss': 0.7906, 'grad_norm': 2.2955729961395264, 'learning_rate': 6.436207245221204e-07} [Rank 1] Trainer log: {'loss': 0.7906, 'grad_norm': 2.2955729961395264, 'learning_rate': 6.436207245221204e-07}[Rank 0] Trainer log: {'loss': 0.7906, 'grad_norm': 2.2955729961395264, 'learning_rate': 6.436207245221204e-07} [Rank 3] Trainer log: {'loss': 0.7906, 'grad_norm': 2.2955729961395264, 'learning_rate': 6.436207245221204e-07} {'loss': 0.7906, 'grad_norm': 2.2955729961395264, 'learning_rate': 6.436207245221204e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8621, 'grad_norm': 7.184272766113281, 'learning_rate': 6.412159963219023e-07} [Rank 1] Trainer log: {'loss': 0.8621, 'grad_norm': 7.184272766113281, 'learning_rate': 6.412159963219023e-07} [Rank 0] Trainer log: {'loss': 0.8621, 'grad_norm': 7.184272766113281, 'learning_rate': 6.412159963219023e-07} [Rank 2] Trainer log: {'loss': 0.8621, 'grad_norm': 7.184272766113281, 'learning_rate': 6.412159963219023e-07} {'loss': 0.8621, 'grad_norm': 7.184272766113281, 'learning_rate': 6.412159963219023e-07, 'epoch': 0.89} [Rank 2] Trainer log: {'loss': 0.896, 'grad_norm': 2.193225145339966, 'learning_rate': 6.388156200599726e-07}[Rank 1] Trainer log: {'loss': 0.896, 'grad_norm': 2.193225145339966, 'learning_rate': 6.388156200599726e-07}[Rank 3] Trainer log: {'loss': 0.896, 'grad_norm': 2.193225145339966, 'learning_rate': 6.388156200599726e-07} [Rank 0] Trainer log: {'loss': 0.896, 'grad_norm': 2.193225145339966, 'learning_rate': 6.388156200599726e-07} {'loss': 0.896, 'grad_norm': 2.193225145339966, 'learning_rate': 6.388156200599726e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8114, 'grad_norm': 3.8707504272460938, 'learning_rate': 6.3641959685253e-07}[Rank 0] Trainer log: {'loss': 0.8114, 'grad_norm': 3.8707504272460938, 'learning_rate': 6.3641959685253e-07}[Rank 1] Trainer log: {'loss': 0.8114, 'grad_norm': 3.8707504272460938, 'learning_rate': 6.3641959685253e-07} [Rank 2] Trainer log: {'loss': 0.8114, 'grad_norm': 3.8707504272460938, 'learning_rate': 6.3641959685253e-07} {'loss': 0.8114, 'grad_norm': 3.8707504272460938, 'learning_rate': 6.3641959685253e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.9907, 'grad_norm': 2.2368855476379395, 'learning_rate': 6.340279278137518e-07}[Rank 3] Trainer log: {'loss': 0.9907, 'grad_norm': 2.2368855476379395, 'learning_rate': 6.340279278137518e-07} [Rank 1] Trainer log: {'loss': 0.9907, 'grad_norm': 2.2368855476379395, 'learning_rate': 6.340279278137518e-07} [Rank 2] Trainer log: {'loss': 0.9907, 'grad_norm': 2.2368855476379395, 'learning_rate': 6.340279278137518e-07} {'loss': 0.9907, 'grad_norm': 2.2368855476379395, 'learning_rate': 6.340279278137518e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.9005, 'grad_norm': 3.7600481510162354, 'learning_rate': 6.31640614055794e-07} [Rank 2] Trainer log: {'loss': 0.9005, 'grad_norm': 3.7600481510162354, 'learning_rate': 6.31640614055794e-07}[Rank 0] Trainer log: {'loss': 0.9005, 'grad_norm': 3.7600481510162354, 'learning_rate': 6.31640614055794e-07}[Rank 1] Trainer log: {'loss': 0.9005, 'grad_norm': 3.7600481510162354, 'learning_rate': 6.31640614055794e-07} {'loss': 0.9005, 'grad_norm': 3.7600481510162354, 'learning_rate': 6.31640614055794e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.7007, 'grad_norm': 5.15255069732666, 'learning_rate': 6.292576566887787e-07}[Rank 1] Trainer log: {'loss': 0.7007, 'grad_norm': 5.15255069732666, 'learning_rate': 6.292576566887787e-07} [Rank 0] Trainer log: {'loss': 0.7007, 'grad_norm': 5.15255069732666, 'learning_rate': 6.292576566887787e-07}[Rank 2] Trainer log: {'loss': 0.7007, 'grad_norm': 5.15255069732666, 'learning_rate': 6.292576566887787e-07} {'loss': 0.7007, 'grad_norm': 5.15255069732666, 'learning_rate': 6.292576566887787e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.7336, 'grad_norm': 8.90782642364502, 'learning_rate': 6.268790568208116e-07}[Rank 1] Trainer log: {'loss': 0.7336, 'grad_norm': 8.90782642364502, 'learning_rate': 6.268790568208116e-07}[Rank 3] Trainer log: {'loss': 0.7336, 'grad_norm': 8.90782642364502, 'learning_rate': 6.268790568208116e-07} [Rank 2] Trainer log: {'loss': 0.7336, 'grad_norm': 8.90782642364502, 'learning_rate': 6.268790568208116e-07} {'loss': 0.7336, 'grad_norm': 8.90782642364502, 'learning_rate': 6.268790568208116e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.9483, 'grad_norm': 5.516266345977783, 'learning_rate': 6.24504815557967e-07}[Rank 0] Trainer log: {'loss': 0.9483, 'grad_norm': 5.516266345977783, 'learning_rate': 6.24504815557967e-07}[Rank 1] Trainer log: {'loss': 0.9483, 'grad_norm': 5.516266345977783, 'learning_rate': 6.24504815557967e-07} [Rank 2] Trainer log: {'loss': 0.9483, 'grad_norm': 5.516266345977783, 'learning_rate': 6.24504815557967e-07} {'loss': 0.9483, 'grad_norm': 5.516266345977783, 'learning_rate': 6.24504815557967e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 0.9115, 'grad_norm': 5.219529628753662, 'learning_rate': 6.221349340042937e-07}[Rank 3] Trainer log: {'loss': 0.9115, 'grad_norm': 5.219529628753662, 'learning_rate': 6.221349340042937e-07} [Rank 2] Trainer log: {'loss': 0.9115, 'grad_norm': 5.219529628753662, 'learning_rate': 6.221349340042937e-07} [Rank 0] Trainer log: {'loss': 0.9115, 'grad_norm': 5.219529628753662, 'learning_rate': 6.221349340042937e-07} {'loss': 0.9115, 'grad_norm': 5.219529628753662, 'learning_rate': 6.221349340042937e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.799, 'grad_norm': 3.442587375640869, 'learning_rate': 6.197694132618115e-07}[Rank 1] Trainer log: {'loss': 0.799, 'grad_norm': 3.442587375640869, 'learning_rate': 6.197694132618115e-07} [Rank 3] Trainer log: {'loss': 0.799, 'grad_norm': 3.442587375640869, 'learning_rate': 6.197694132618115e-07} [Rank 2] Trainer log: {'loss': 0.799, 'grad_norm': 3.442587375640869, 'learning_rate': 6.197694132618115e-07} {'loss': 0.799, 'grad_norm': 3.442587375640869, 'learning_rate': 6.197694132618115e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.6708, 'grad_norm': 7.776933193206787, 'learning_rate': 6.174082544305149e-07}[Rank 1] Trainer log: {'loss': 0.6708, 'grad_norm': 7.776933193206787, 'learning_rate': 6.174082544305149e-07}[Rank 2] Trainer log: {'loss': 0.6708, 'grad_norm': 7.776933193206787, 'learning_rate': 6.174082544305149e-07} [Rank 0] Trainer log: {'loss': 0.6708, 'grad_norm': 7.776933193206787, 'learning_rate': 6.174082544305149e-07} {'loss': 0.6708, 'grad_norm': 7.776933193206787, 'learning_rate': 6.174082544305149e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8275, 'grad_norm': 4.444672107696533, 'learning_rate': 6.1505145860837e-07} [Rank 0] Trainer log: {'loss': 0.8275, 'grad_norm': 4.444672107696533, 'learning_rate': 6.1505145860837e-07}[Rank 2] Trainer log: {'loss': 0.8275, 'grad_norm': 4.444672107696533, 'learning_rate': 6.1505145860837e-07} [Rank 1] Trainer log: {'loss': 0.8275, 'grad_norm': 4.444672107696533, 'learning_rate': 6.1505145860837e-07} {'loss': 0.8275, 'grad_norm': 4.444672107696533, 'learning_rate': 6.1505145860837e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8794, 'grad_norm': 4.909470558166504, 'learning_rate': 6.126990268913091e-07} [Rank 1] Trainer log: {'loss': 0.8794, 'grad_norm': 4.909470558166504, 'learning_rate': 6.126990268913091e-07}[Rank 0] Trainer log: {'loss': 0.8794, 'grad_norm': 4.909470558166504, 'learning_rate': 6.126990268913091e-07} [Rank 2] Trainer log: {'loss': 0.8794, 'grad_norm': 4.909470558166504, 'learning_rate': 6.126990268913091e-07} {'loss': 0.8794, 'grad_norm': 4.909470558166504, 'learning_rate': 6.126990268913091e-07, 'epoch': 0.89} [Rank 1] Trainer log: {'loss': 0.691, 'grad_norm': 6.546082019805908, 'learning_rate': 6.103509603732416e-07}[Rank 0] Trainer log: {'loss': 0.691, 'grad_norm': 6.546082019805908, 'learning_rate': 6.103509603732416e-07} [Rank 3] Trainer log: {'loss': 0.691, 'grad_norm': 6.546082019805908, 'learning_rate': 6.103509603732416e-07} [Rank 2] Trainer log: {'loss': 0.691, 'grad_norm': 6.546082019805908, 'learning_rate': 6.103509603732416e-07} {'loss': 0.691, 'grad_norm': 6.546082019805908, 'learning_rate': 6.103509603732416e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.921, 'grad_norm': 5.066746234893799, 'learning_rate': 6.080072601460451e-07} [Rank 1] Trainer log: {'loss': 0.921, 'grad_norm': 5.066746234893799, 'learning_rate': 6.080072601460451e-07} [Rank 2] Trainer log: {'loss': 0.921, 'grad_norm': 5.066746234893799, 'learning_rate': 6.080072601460451e-07} [Rank 0] Trainer log: {'loss': 0.921, 'grad_norm': 5.066746234893799, 'learning_rate': 6.080072601460451e-07} {'loss': 0.921, 'grad_norm': 5.066746234893799, 'learning_rate': 6.080072601460451e-07, 'epoch': 0.89} [Rank 2] Trainer log: {'loss': 0.7493, 'grad_norm': 9.553936958312988, 'learning_rate': 6.056679272995647e-07}[Rank 3] Trainer log: {'loss': 0.7493, 'grad_norm': 9.553936958312988, 'learning_rate': 6.056679272995647e-07}[Rank 1] Trainer log: {'loss': 0.7493, 'grad_norm': 9.553936958312988, 'learning_rate': 6.056679272995647e-07} [Rank 0] Trainer log: {'loss': 0.7493, 'grad_norm': 9.553936958312988, 'learning_rate': 6.056679272995647e-07} {'loss': 0.7493, 'grad_norm': 9.553936958312988, 'learning_rate': 6.056679272995647e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.7182, 'grad_norm': 8.36179256439209, 'learning_rate': 6.033329629216189e-07}[Rank 3] Trainer log: {'loss': 0.7182, 'grad_norm': 8.36179256439209, 'learning_rate': 6.033329629216189e-07}[Rank 1] Trainer log: {'loss': 0.7182, 'grad_norm': 8.36179256439209, 'learning_rate': 6.033329629216189e-07} [Rank 2] Trainer log: {'loss': 0.7182, 'grad_norm': 8.36179256439209, 'learning_rate': 6.033329629216189e-07} {'loss': 0.7182, 'grad_norm': 8.36179256439209, 'learning_rate': 6.033329629216189e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8551, 'grad_norm': 2.233321189880371, 'learning_rate': 6.010023680979893e-07}[Rank 1] Trainer log: {'loss': 0.8551, 'grad_norm': 2.233321189880371, 'learning_rate': 6.010023680979893e-07} [Rank 2] Trainer log: {'loss': 0.8551, 'grad_norm': 2.233321189880371, 'learning_rate': 6.010023680979893e-07} [Rank 0] Trainer log: {'loss': 0.8551, 'grad_norm': 2.233321189880371, 'learning_rate': 6.010023680979893e-07} {'loss': 0.8551, 'grad_norm': 2.233321189880371, 'learning_rate': 6.010023680979893e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.9446, 'grad_norm': 5.3710737228393555, 'learning_rate': 5.986761439124289e-07} [Rank 2] Trainer log: {'loss': 0.9446, 'grad_norm': 5.3710737228393555, 'learning_rate': 5.986761439124289e-07} [Rank 0] Trainer log: {'loss': 0.9446, 'grad_norm': 5.3710737228393555, 'learning_rate': 5.986761439124289e-07}[Rank 1] Trainer log: {'loss': 0.9446, 'grad_norm': 5.3710737228393555, 'learning_rate': 5.986761439124289e-07} {'loss': 0.9446, 'grad_norm': 5.3710737228393555, 'learning_rate': 5.986761439124289e-07, 'epoch': 0.89} [Rank 3] Trainer log: {'loss': 0.8222, 'grad_norm': 5.1239848136901855, 'learning_rate': 5.963542914466569e-07} [Rank 0] Trainer log: {'loss': 0.8222, 'grad_norm': 5.1239848136901855, 'learning_rate': 5.963542914466569e-07}[Rank 1] Trainer log: {'loss': 0.8222, 'grad_norm': 5.1239848136901855, 'learning_rate': 5.963542914466569e-07} [Rank 2] Trainer log: {'loss': 0.8222, 'grad_norm': 5.1239848136901855, 'learning_rate': 5.963542914466569e-07} {'loss': 0.8222, 'grad_norm': 5.1239848136901855, 'learning_rate': 5.963542914466569e-07, 'epoch': 0.89} [Rank 0] Trainer log: {'loss': 0.9728, 'grad_norm': 2.581252336502075, 'learning_rate': 5.94036811780363e-07}[Rank 3] Trainer log: {'loss': 0.9728, 'grad_norm': 2.581252336502075, 'learning_rate': 5.94036811780363e-07}[Rank 1] Trainer log: {'loss': 0.9728, 'grad_norm': 2.581252336502075, 'learning_rate': 5.94036811780363e-07} [Rank 2] Trainer log: {'loss': 0.9728, 'grad_norm': 2.581252336502075, 'learning_rate': 5.94036811780363e-07} {'loss': 0.9728, 'grad_norm': 2.581252336502075, 'learning_rate': 5.94036811780363e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.8425, 'grad_norm': 2.5363917350769043, 'learning_rate': 5.917237059911963e-07}[Rank 3] Trainer log: {'loss': 0.8425, 'grad_norm': 2.5363917350769043, 'learning_rate': 5.917237059911963e-07}[Rank 0] Trainer log: {'loss': 0.8425, 'grad_norm': 2.5363917350769043, 'learning_rate': 5.917237059911963e-07} [Rank 2] Trainer log: {'loss': 0.8425, 'grad_norm': 2.5363917350769043, 'learning_rate': 5.917237059911963e-07} {'loss': 0.8425, 'grad_norm': 2.5363917350769043, 'learning_rate': 5.917237059911963e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.6119, 'grad_norm': 8.582449913024902, 'learning_rate': 5.894149751547806e-07}[Rank 1] Trainer log: {'loss': 0.6119, 'grad_norm': 8.582449913024902, 'learning_rate': 5.894149751547806e-07} [Rank 0] Trainer log: {'loss': 0.6119, 'grad_norm': 8.582449913024902, 'learning_rate': 5.894149751547806e-07}[Rank 2] Trainer log: {'loss': 0.6119, 'grad_norm': 8.582449913024902, 'learning_rate': 5.894149751547806e-07} {'loss': 0.6119, 'grad_norm': 8.582449913024902, 'learning_rate': 5.894149751547806e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9873, 'grad_norm': 9.198247909545898, 'learning_rate': 5.871106203447019e-07}[Rank 2] Trainer log: {'loss': 0.9873, 'grad_norm': 9.198247909545898, 'learning_rate': 5.871106203447019e-07} [Rank 0] Trainer log: {'loss': 0.9873, 'grad_norm': 9.198247909545898, 'learning_rate': 5.871106203447019e-07}[Rank 1] Trainer log: {'loss': 0.9873, 'grad_norm': 9.198247909545898, 'learning_rate': 5.871106203447019e-07} {'loss': 0.9873, 'grad_norm': 9.198247909545898, 'learning_rate': 5.871106203447019e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.8318, 'grad_norm': 4.501786231994629, 'learning_rate': 5.848106426325095e-07}[Rank 3] Trainer log: {'loss': 0.8318, 'grad_norm': 4.501786231994629, 'learning_rate': 5.848106426325095e-07} [Rank 2] Trainer log: {'loss': 0.8318, 'grad_norm': 4.501786231994629, 'learning_rate': 5.848106426325095e-07} [Rank 0] Trainer log: {'loss': 0.8318, 'grad_norm': 4.501786231994629, 'learning_rate': 5.848106426325095e-07} {'loss': 0.8318, 'grad_norm': 4.501786231994629, 'learning_rate': 5.848106426325095e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.7579, 'grad_norm': 4.943948268890381, 'learning_rate': 5.825150430877157e-07}[Rank 0] Trainer log: {'loss': 0.7579, 'grad_norm': 4.943948268890381, 'learning_rate': 5.825150430877157e-07}[Rank 1] Trainer log: {'loss': 0.7579, 'grad_norm': 4.943948268890381, 'learning_rate': 5.825150430877157e-07} [Rank 2] Trainer log: {'loss': 0.7579, 'grad_norm': 4.943948268890381, 'learning_rate': 5.825150430877157e-07} {'loss': 0.7579, 'grad_norm': 4.943948268890381, 'learning_rate': 5.825150430877157e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 1.0215, 'grad_norm': 2.7522549629211426, 'learning_rate': 5.802238227778045e-07}[Rank 3] Trainer log: {'loss': 1.0215, 'grad_norm': 2.7522549629211426, 'learning_rate': 5.802238227778045e-07} [Rank 1] Trainer log: {'loss': 1.0215, 'grad_norm': 2.7522549629211426, 'learning_rate': 5.802238227778045e-07} [Rank 0] Trainer log: {'loss': 1.0215, 'grad_norm': 2.7522549629211426, 'learning_rate': 5.802238227778045e-07} {'loss': 1.0215, 'grad_norm': 2.7522549629211426, 'learning_rate': 5.802238227778045e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.738, 'grad_norm': 4.653674125671387, 'learning_rate': 5.779369827682158e-07}[Rank 1] Trainer log: {'loss': 0.738, 'grad_norm': 4.653674125671387, 'learning_rate': 5.779369827682158e-07} [Rank 3] Trainer log: {'loss': 0.738, 'grad_norm': 4.653674125671387, 'learning_rate': 5.779369827682158e-07} [Rank 2] Trainer log: {'loss': 0.738, 'grad_norm': 4.653674125671387, 'learning_rate': 5.779369827682158e-07} {'loss': 0.738, 'grad_norm': 4.653674125671387, 'learning_rate': 5.779369827682158e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.7365, 'grad_norm': 4.448984622955322, 'learning_rate': 5.756545241223554e-07}[Rank 0] Trainer log: {'loss': 0.7365, 'grad_norm': 4.448984622955322, 'learning_rate': 5.756545241223554e-07} [Rank 2] Trainer log: {'loss': 0.7365, 'grad_norm': 4.448984622955322, 'learning_rate': 5.756545241223554e-07} [Rank 1] Trainer log: {'loss': 0.7365, 'grad_norm': 4.448984622955322, 'learning_rate': 5.756545241223554e-07} {'loss': 0.7365, 'grad_norm': 4.448984622955322, 'learning_rate': 5.756545241223554e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.9334, 'grad_norm': 3.2630815505981445, 'learning_rate': 5.733764479015935e-07} [Rank 0] Trainer log: {'loss': 0.9334, 'grad_norm': 3.2630815505981445, 'learning_rate': 5.733764479015935e-07} [Rank 1] Trainer log: {'loss': 0.9334, 'grad_norm': 3.2630815505981445, 'learning_rate': 5.733764479015935e-07} [Rank 3] Trainer log: {'loss': 0.9334, 'grad_norm': 3.2630815505981445, 'learning_rate': 5.733764479015935e-07} {'loss': 0.9334, 'grad_norm': 3.2630815505981445, 'learning_rate': 5.733764479015935e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.6217, 'grad_norm': 7.506957530975342, 'learning_rate': 5.711027551652593e-07} [Rank 3] Trainer log: {'loss': 0.6217, 'grad_norm': 7.506957530975342, 'learning_rate': 5.711027551652593e-07}[Rank 0] Trainer log: {'loss': 0.6217, 'grad_norm': 7.506957530975342, 'learning_rate': 5.711027551652593e-07} [Rank 1] Trainer log: {'loss': 0.6217, 'grad_norm': 7.506957530975342, 'learning_rate': 5.711027551652593e-07} {'loss': 0.6217, 'grad_norm': 7.506957530975342, 'learning_rate': 5.711027551652593e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.9234, 'grad_norm': 8.1272554397583, 'learning_rate': 5.688334469706446e-07}[Rank 3] Trainer log: {'loss': 0.9234, 'grad_norm': 8.1272554397583, 'learning_rate': 5.688334469706446e-07}[Rank 1] Trainer log: {'loss': 0.9234, 'grad_norm': 8.1272554397583, 'learning_rate': 5.688334469706446e-07} [Rank 2] Trainer log: {'loss': 0.9234, 'grad_norm': 8.1272554397583, 'learning_rate': 5.688334469706446e-07} {'loss': 0.9234, 'grad_norm': 8.1272554397583, 'learning_rate': 5.688334469706446e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9453, 'grad_norm': 4.378366947174072, 'learning_rate': 5.665685243730068e-07}[Rank 0] Trainer log: {'loss': 0.9453, 'grad_norm': 4.378366947174072, 'learning_rate': 5.665685243730068e-07} [Rank 2] Trainer log: {'loss': 0.9453, 'grad_norm': 4.378366947174072, 'learning_rate': 5.665685243730068e-07}[Rank 1] Trainer log: {'loss': 0.9453, 'grad_norm': 4.378366947174072, 'learning_rate': 5.665685243730068e-07} {'loss': 0.9453, 'grad_norm': 4.378366947174072, 'learning_rate': 5.665685243730068e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 1.0494, 'grad_norm': 3.2552075386047363, 'learning_rate': 5.643079884255565e-07} [Rank 3] Trainer log: {'loss': 1.0494, 'grad_norm': 3.2552075386047363, 'learning_rate': 5.643079884255565e-07} [Rank 0] Trainer log: {'loss': 1.0494, 'grad_norm': 3.2552075386047363, 'learning_rate': 5.643079884255565e-07}[Rank 2] Trainer log: {'loss': 1.0494, 'grad_norm': 3.2552075386047363, 'learning_rate': 5.643079884255565e-07} {'loss': 1.0494, 'grad_norm': 3.2552075386047363, 'learning_rate': 5.643079884255565e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8545, 'grad_norm': 22.334938049316406, 'learning_rate': 5.620518401794672e-07} [Rank 0] Trainer log: {'loss': 0.8545, 'grad_norm': 22.334938049316406, 'learning_rate': 5.620518401794672e-07}[Rank 1] Trainer log: {'loss': 0.8545, 'grad_norm': 22.334938049316406, 'learning_rate': 5.620518401794672e-07} [Rank 2] Trainer log: {'loss': 0.8545, 'grad_norm': 22.334938049316406, 'learning_rate': 5.620518401794672e-07} {'loss': 0.8545, 'grad_norm': 22.334938049316406, 'learning_rate': 5.620518401794672e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8423, 'grad_norm': 4.628236293792725, 'learning_rate': 5.598000806838766e-07}[Rank 2] Trainer log: {'loss': 0.8423, 'grad_norm': 4.628236293792725, 'learning_rate': 5.598000806838766e-07} [Rank 1] Trainer log: {'loss': 0.8423, 'grad_norm': 4.628236293792725, 'learning_rate': 5.598000806838766e-07} [Rank 0] Trainer log: {'loss': 0.8423, 'grad_norm': 4.628236293792725, 'learning_rate': 5.598000806838766e-07} {'loss': 0.8423, 'grad_norm': 4.628236293792725, 'learning_rate': 5.598000806838766e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.8247, 'grad_norm': 3.0932765007019043, 'learning_rate': 5.575527109858747e-07} [Rank 2] Trainer log: {'loss': 0.8247, 'grad_norm': 3.0932765007019043, 'learning_rate': 5.575527109858747e-07} [Rank 3] Trainer log: {'loss': 0.8247, 'grad_norm': 3.0932765007019043, 'learning_rate': 5.575527109858747e-07}[Rank 0] Trainer log: {'loss': 0.8247, 'grad_norm': 3.0932765007019043, 'learning_rate': 5.575527109858747e-07} {'loss': 0.8247, 'grad_norm': 3.0932765007019043, 'learning_rate': 5.575527109858747e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 1.0044, 'grad_norm': 4.8956298828125, 'learning_rate': 5.553097321305134e-07} [Rank 0] Trainer log: {'loss': 1.0044, 'grad_norm': 4.8956298828125, 'learning_rate': 5.553097321305134e-07}[Rank 2] Trainer log: {'loss': 1.0044, 'grad_norm': 4.8956298828125, 'learning_rate': 5.553097321305134e-07} [Rank 1] Trainer log: {'loss': 1.0044, 'grad_norm': 4.8956298828125, 'learning_rate': 5.553097321305134e-07} {'loss': 1.0044, 'grad_norm': 4.8956298828125, 'learning_rate': 5.553097321305134e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.7957, 'grad_norm': 2.652259111404419, 'learning_rate': 5.53071145160804e-07}[Rank 0] Trainer log: {'loss': 0.7957, 'grad_norm': 2.652259111404419, 'learning_rate': 5.53071145160804e-07}[Rank 3] Trainer log: {'loss': 0.7957, 'grad_norm': 2.652259111404419, 'learning_rate': 5.53071145160804e-07} [Rank 1] Trainer log: {'loss': 0.7957, 'grad_norm': 2.652259111404419, 'learning_rate': 5.53071145160804e-07} {'loss': 0.7957, 'grad_norm': 2.652259111404419, 'learning_rate': 5.53071145160804e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 1.0521, 'grad_norm': 5.651650905609131, 'learning_rate': 5.508369511177136e-07}[Rank 1] Trainer log: {'loss': 1.0521, 'grad_norm': 5.651650905609131, 'learning_rate': 5.508369511177136e-07} [Rank 0] Trainer log: {'loss': 1.0521, 'grad_norm': 5.651650905609131, 'learning_rate': 5.508369511177136e-07} [Rank 2] Trainer log: {'loss': 1.0521, 'grad_norm': 5.651650905609131, 'learning_rate': 5.508369511177136e-07} {'loss': 1.0521, 'grad_norm': 5.651650905609131, 'learning_rate': 5.508369511177136e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8885, 'grad_norm': 3.7264742851257324, 'learning_rate': 5.486071510401658e-07}[Rank 2] Trainer log: {'loss': 0.8885, 'grad_norm': 3.7264742851257324, 'learning_rate': 5.486071510401658e-07} [Rank 1] Trainer log: {'loss': 0.8885, 'grad_norm': 3.7264742851257324, 'learning_rate': 5.486071510401658e-07} [Rank 0] Trainer log: {'loss': 0.8885, 'grad_norm': 3.7264742851257324, 'learning_rate': 5.486071510401658e-07} {'loss': 0.8885, 'grad_norm': 3.7264742851257324, 'learning_rate': 5.486071510401658e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.8208, 'grad_norm': 6.894049644470215, 'learning_rate': 5.463817459650467e-07}[Rank 3] Trainer log: {'loss': 0.8208, 'grad_norm': 6.894049644470215, 'learning_rate': 5.463817459650467e-07}[Rank 1] Trainer log: {'loss': 0.8208, 'grad_norm': 6.894049644470215, 'learning_rate': 5.463817459650467e-07} [Rank 2] Trainer log: {'loss': 0.8208, 'grad_norm': 6.894049644470215, 'learning_rate': 5.463817459650467e-07} {'loss': 0.8208, 'grad_norm': 6.894049644470215, 'learning_rate': 5.463817459650467e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.8222, 'grad_norm': 3.1531314849853516, 'learning_rate': 5.441607369271906e-07}[Rank 2] Trainer log: {'loss': 0.8222, 'grad_norm': 3.1531314849853516, 'learning_rate': 5.441607369271906e-07}[Rank 3] Trainer log: {'loss': 0.8222, 'grad_norm': 3.1531314849853516, 'learning_rate': 5.441607369271906e-07} [Rank 1] Trainer log: {'loss': 0.8222, 'grad_norm': 3.1531314849853516, 'learning_rate': 5.441607369271906e-07} {'loss': 0.8222, 'grad_norm': 3.1531314849853516, 'learning_rate': 5.441607369271906e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8004, 'grad_norm': 5.507090091705322, 'learning_rate': 5.419441249593916e-07}[Rank 2] Trainer log: {'loss': 0.8004, 'grad_norm': 5.507090091705322, 'learning_rate': 5.419441249593916e-07} [Rank 0] Trainer log: {'loss': 0.8004, 'grad_norm': 5.507090091705322, 'learning_rate': 5.419441249593916e-07} [Rank 1] Trainer log: {'loss': 0.8004, 'grad_norm': 5.507090091705322, 'learning_rate': 5.419441249593916e-07} {'loss': 0.8004, 'grad_norm': 5.507090091705322, 'learning_rate': 5.419441249593916e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8163, 'grad_norm': 4.2549591064453125, 'learning_rate': 5.397319110924016e-07}[Rank 0] Trainer log: {'loss': 0.8163, 'grad_norm': 4.2549591064453125, 'learning_rate': 5.397319110924016e-07}[Rank 1] Trainer log: {'loss': 0.8163, 'grad_norm': 4.2549591064453125, 'learning_rate': 5.397319110924016e-07} [Rank 2] Trainer log: {'loss': 0.8163, 'grad_norm': 4.2549591064453125, 'learning_rate': 5.397319110924016e-07} {'loss': 0.8163, 'grad_norm': 4.2549591064453125, 'learning_rate': 5.397319110924016e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.725, 'grad_norm': 4.9870924949646, 'learning_rate': 5.375240963549211e-07}[Rank 3] Trainer log: {'loss': 0.725, 'grad_norm': 4.9870924949646, 'learning_rate': 5.375240963549211e-07} [Rank 2] Trainer log: {'loss': 0.725, 'grad_norm': 4.9870924949646, 'learning_rate': 5.375240963549211e-07} [Rank 1] Trainer log: {'loss': 0.725, 'grad_norm': 4.9870924949646, 'learning_rate': 5.375240963549211e-07} {'loss': 0.725, 'grad_norm': 4.9870924949646, 'learning_rate': 5.375240963549211e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.7699, 'grad_norm': 7.060013771057129, 'learning_rate': 5.353206817736101e-07}[Rank 0] Trainer log: {'loss': 0.7699, 'grad_norm': 7.060013771057129, 'learning_rate': 5.353206817736101e-07}[Rank 3] Trainer log: {'loss': 0.7699, 'grad_norm': 7.060013771057129, 'learning_rate': 5.353206817736101e-07} [Rank 2] Trainer log: {'loss': 0.7699, 'grad_norm': 7.060013771057129, 'learning_rate': 5.353206817736101e-07} {'loss': 0.7699, 'grad_norm': 7.060013771057129, 'learning_rate': 5.353206817736101e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.7494, 'grad_norm': 3.444916248321533, 'learning_rate': 5.331216683730789e-07} [Rank 1] Trainer log: {'loss': 0.7494, 'grad_norm': 3.444916248321533, 'learning_rate': 5.331216683730789e-07} [Rank 0] Trainer log: {'loss': 0.7494, 'grad_norm': 3.444916248321533, 'learning_rate': 5.331216683730789e-07}[Rank 2] Trainer log: {'loss': 0.7494, 'grad_norm': 3.444916248321533, 'learning_rate': 5.331216683730789e-07} {'loss': 0.7494, 'grad_norm': 3.444916248321533, 'learning_rate': 5.331216683730789e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.7373, 'grad_norm': 3.4014649391174316, 'learning_rate': 5.309270571758951e-07} [Rank 0] Trainer log: {'loss': 0.7373, 'grad_norm': 3.4014649391174316, 'learning_rate': 5.309270571758951e-07}[Rank 2] Trainer log: {'loss': 0.7373, 'grad_norm': 3.4014649391174316, 'learning_rate': 5.309270571758951e-07} [Rank 1] Trainer log: {'loss': 0.7373, 'grad_norm': 3.4014649391174316, 'learning_rate': 5.309270571758951e-07} {'loss': 0.7373, 'grad_norm': 3.4014649391174316, 'learning_rate': 5.309270571758951e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.7781, 'grad_norm': 5.809092998504639, 'learning_rate': 5.28736849202578e-07}[Rank 3] Trainer log: {'loss': 0.7781, 'grad_norm': 5.809092998504639, 'learning_rate': 5.28736849202578e-07}[Rank 0] Trainer log: {'loss': 0.7781, 'grad_norm': 5.809092998504639, 'learning_rate': 5.28736849202578e-07} [Rank 2] Trainer log: {'loss': 0.7781, 'grad_norm': 5.809092998504639, 'learning_rate': 5.28736849202578e-07} {'loss': 0.7781, 'grad_norm': 5.809092998504639, 'learning_rate': 5.28736849202578e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9076, 'grad_norm': 2.3170580863952637, 'learning_rate': 5.265510454715961e-07} [Rank 0] Trainer log: {'loss': 0.9076, 'grad_norm': 2.3170580863952637, 'learning_rate': 5.265510454715961e-07}[Rank 1] Trainer log: {'loss': 0.9076, 'grad_norm': 2.3170580863952637, 'learning_rate': 5.265510454715961e-07} [Rank 2] Trainer log: {'loss': 0.9076, 'grad_norm': 2.3170580863952637, 'learning_rate': 5.265510454715961e-07} {'loss': 0.9076, 'grad_norm': 2.3170580863952637, 'learning_rate': 5.265510454715961e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.9457, 'grad_norm': 6.048744201660156, 'learning_rate': 5.243696469993753e-07} [Rank 0] Trainer log: {'loss': 0.9457, 'grad_norm': 6.048744201660156, 'learning_rate': 5.243696469993753e-07}[Rank 3] Trainer log: {'loss': 0.9457, 'grad_norm': 6.048744201660156, 'learning_rate': 5.243696469993753e-07} [Rank 1] Trainer log: {'loss': 0.9457, 'grad_norm': 6.048744201660156, 'learning_rate': 5.243696469993753e-07} {'loss': 0.9457, 'grad_norm': 6.048744201660156, 'learning_rate': 5.243696469993753e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.8399, 'grad_norm': 4.671056270599365, 'learning_rate': 5.221926548002876e-07} [Rank 0] Trainer log: {'loss': 0.8399, 'grad_norm': 4.671056270599365, 'learning_rate': 5.221926548002876e-07}[Rank 3] Trainer log: {'loss': 0.8399, 'grad_norm': 4.671056270599365, 'learning_rate': 5.221926548002876e-07} [Rank 1] Trainer log: {'loss': 0.8399, 'grad_norm': 4.671056270599365, 'learning_rate': 5.221926548002876e-07} {'loss': 0.8399, 'grad_norm': 4.671056270599365, 'learning_rate': 5.221926548002876e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.5527, 'grad_norm': 22.148271560668945, 'learning_rate': 5.200200698866587e-07}[Rank 1] Trainer log: {'loss': 0.5527, 'grad_norm': 22.148271560668945, 'learning_rate': 5.200200698866587e-07}[Rank 0] Trainer log: {'loss': 0.5527, 'grad_norm': 22.148271560668945, 'learning_rate': 5.200200698866587e-07} [Rank 2] Trainer log: {'loss': 0.5527, 'grad_norm': 22.148271560668945, 'learning_rate': 5.200200698866587e-07} {'loss': 0.5527, 'grad_norm': 22.148271560668945, 'learning_rate': 5.200200698866587e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 1.0388, 'grad_norm': 3.8115804195404053, 'learning_rate': 5.178518932687671e-07}[Rank 2] Trainer log: {'loss': 1.0388, 'grad_norm': 3.8115804195404053, 'learning_rate': 5.178518932687671e-07}[Rank 1] Trainer log: {'loss': 1.0388, 'grad_norm': 3.8115804195404053, 'learning_rate': 5.178518932687671e-07} [Rank 0] Trainer log: {'loss': 1.0388, 'grad_norm': 3.8115804195404053, 'learning_rate': 5.178518932687671e-07} {'loss': 1.0388, 'grad_norm': 3.8115804195404053, 'learning_rate': 5.178518932687671e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8972, 'grad_norm': 2.9750633239746094, 'learning_rate': 5.156881259548363e-07}[Rank 2] Trainer log: {'loss': 0.8972, 'grad_norm': 2.9750633239746094, 'learning_rate': 5.156881259548363e-07}[Rank 1] Trainer log: {'loss': 0.8972, 'grad_norm': 2.9750633239746094, 'learning_rate': 5.156881259548363e-07} [Rank 0] Trainer log: {'loss': 0.8972, 'grad_norm': 2.9750633239746094, 'learning_rate': 5.156881259548363e-07} {'loss': 0.8972, 'grad_norm': 2.9750633239746094, 'learning_rate': 5.156881259548363e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9015, 'grad_norm': 5.232257843017578, 'learning_rate': 5.135287689510415e-07}[Rank 1] Trainer log: {'loss': 0.9015, 'grad_norm': 5.232257843017578, 'learning_rate': 5.135287689510415e-07} [Rank 0] Trainer log: {'loss': 0.9015, 'grad_norm': 5.232257843017578, 'learning_rate': 5.135287689510415e-07}[Rank 2] Trainer log: {'loss': 0.9015, 'grad_norm': 5.232257843017578, 'learning_rate': 5.135287689510415e-07} {'loss': 0.9015, 'grad_norm': 5.232257843017578, 'learning_rate': 5.135287689510415e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.9241, 'grad_norm': 4.551783084869385, 'learning_rate': 5.113738232615096e-07}[Rank 1] Trainer log: {'loss': 0.9241, 'grad_norm': 4.551783084869385, 'learning_rate': 5.113738232615096e-07}[Rank 3] Trainer log: {'loss': 0.9241, 'grad_norm': 4.551783084869385, 'learning_rate': 5.113738232615096e-07} [Rank 0] Trainer log: {'loss': 0.9241, 'grad_norm': 4.551783084869385, 'learning_rate': 5.113738232615096e-07} {'loss': 0.9241, 'grad_norm': 4.551783084869385, 'learning_rate': 5.113738232615096e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.912, 'grad_norm': 20.157852172851562, 'learning_rate': 5.092232898883143e-07}[Rank 3] Trainer log: {'loss': 0.912, 'grad_norm': 20.157852172851562, 'learning_rate': 5.092232898883143e-07} [Rank 0] Trainer log: {'loss': 0.912, 'grad_norm': 20.157852172851562, 'learning_rate': 5.092232898883143e-07} [Rank 2] Trainer log: {'loss': 0.912, 'grad_norm': 20.157852172851562, 'learning_rate': 5.092232898883143e-07} {'loss': 0.912, 'grad_norm': 20.157852172851562, 'learning_rate': 5.092232898883143e-07, 'epoch': 0.9} [Rank 1] Trainer log: {'loss': 0.6931, 'grad_norm': 6.071656703948975, 'learning_rate': 5.070771698314758e-07}[Rank 3] Trainer log: {'loss': 0.6931, 'grad_norm': 6.071656703948975, 'learning_rate': 5.070771698314758e-07} [Rank 0] Trainer log: {'loss': 0.6931, 'grad_norm': 6.071656703948975, 'learning_rate': 5.070771698314758e-07}[Rank 2] Trainer log: {'loss': 0.6931, 'grad_norm': 6.071656703948975, 'learning_rate': 5.070771698314758e-07} {'loss': 0.6931, 'grad_norm': 6.071656703948975, 'learning_rate': 5.070771698314758e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9125, 'grad_norm': 11.450650215148926, 'learning_rate': 5.04935464088967e-07} [Rank 0] Trainer log: {'loss': 0.9125, 'grad_norm': 11.450650215148926, 'learning_rate': 5.04935464088967e-07}[Rank 1] Trainer log: {'loss': 0.9125, 'grad_norm': 11.450650215148926, 'learning_rate': 5.04935464088967e-07} [Rank 2] Trainer log: {'loss': 0.9125, 'grad_norm': 11.450650215148926, 'learning_rate': 5.04935464088967e-07} {'loss': 0.9125, 'grad_norm': 11.450650215148926, 'learning_rate': 5.04935464088967e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.6252, 'grad_norm': 5.621769428253174, 'learning_rate': 5.027981736567012e-07}[Rank 1] Trainer log: {'loss': 0.6252, 'grad_norm': 5.621769428253174, 'learning_rate': 5.027981736567012e-07}[Rank 0] Trainer log: {'loss': 0.6252, 'grad_norm': 5.621769428253174, 'learning_rate': 5.027981736567012e-07} [Rank 2] Trainer log: {'loss': 0.6252, 'grad_norm': 5.621769428253174, 'learning_rate': 5.027981736567012e-07} {'loss': 0.6252, 'grad_norm': 5.621769428253174, 'learning_rate': 5.027981736567012e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.7758, 'grad_norm': 2.5873305797576904, 'learning_rate': 5.006652995285433e-07}[Rank 1] Trainer log: {'loss': 0.7758, 'grad_norm': 2.5873305797576904, 'learning_rate': 5.006652995285433e-07}[Rank 3] Trainer log: {'loss': 0.7758, 'grad_norm': 2.5873305797576904, 'learning_rate': 5.006652995285433e-07} [Rank 2] Trainer log: {'loss': 0.7758, 'grad_norm': 2.5873305797576904, 'learning_rate': 5.006652995285433e-07} {'loss': 0.7758, 'grad_norm': 2.5873305797576904, 'learning_rate': 5.006652995285433e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.7091, 'grad_norm': 6.8832926750183105, 'learning_rate': 4.985368426963044e-07}[Rank 1] Trainer log: {'loss': 0.7091, 'grad_norm': 6.8832926750183105, 'learning_rate': 4.985368426963044e-07}[Rank 0] Trainer log: {'loss': 0.7091, 'grad_norm': 6.8832926750183105, 'learning_rate': 4.985368426963044e-07} [Rank 2] Trainer log: {'loss': 0.7091, 'grad_norm': 6.8832926750183105, 'learning_rate': 4.985368426963044e-07} {'loss': 0.7091, 'grad_norm': 6.8832926750183105, 'learning_rate': 4.985368426963044e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8563, 'grad_norm': 6.846954822540283, 'learning_rate': 4.964128041497395e-07} [Rank 0] Trainer log: {'loss': 0.8563, 'grad_norm': 6.846954822540283, 'learning_rate': 4.964128041497395e-07}[Rank 1] Trainer log: {'loss': 0.8563, 'grad_norm': 6.846954822540283, 'learning_rate': 4.964128041497395e-07}[Rank 2] Trainer log: {'loss': 0.8563, 'grad_norm': 6.846954822540283, 'learning_rate': 4.964128041497395e-07} {'loss': 0.8563, 'grad_norm': 6.846954822540283, 'learning_rate': 4.964128041497395e-07, 'epoch': 0.9} [Rank 0] Trainer log: {'loss': 0.9188, 'grad_norm': 2.3758630752563477, 'learning_rate': 4.942931848765497e-07} [Rank 1] Trainer log: {'loss': 0.9188, 'grad_norm': 2.3758630752563477, 'learning_rate': 4.942931848765497e-07} [Rank 2] Trainer log: {'loss': 0.9188, 'grad_norm': 2.3758630752563477, 'learning_rate': 4.942931848765497e-07} [Rank 3] Trainer log: {'loss': 0.9188, 'grad_norm': 2.3758630752563477, 'learning_rate': 4.942931848765497e-07} {'loss': 0.9188, 'grad_norm': 2.3758630752563477, 'learning_rate': 4.942931848765497e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.8354, 'grad_norm': 6.890372276306152, 'learning_rate': 4.92177985862382e-07}[Rank 0] Trainer log: {'loss': 0.8354, 'grad_norm': 6.890372276306152, 'learning_rate': 4.92177985862382e-07}[Rank 2] Trainer log: {'loss': 0.8354, 'grad_norm': 6.890372276306152, 'learning_rate': 4.92177985862382e-07} [Rank 1] Trainer log: {'loss': 0.8354, 'grad_norm': 6.890372276306152, 'learning_rate': 4.92177985862382e-07} {'loss': 0.8354, 'grad_norm': 6.890372276306152, 'learning_rate': 4.92177985862382e-07, 'epoch': 0.9} [Rank 3] Trainer log: {'loss': 0.9419, 'grad_norm': 3.9403789043426514, 'learning_rate': 4.900672080908275e-07}[Rank 1] Trainer log: {'loss': 0.9419, 'grad_norm': 3.9403789043426514, 'learning_rate': 4.900672080908275e-07} [Rank 2] Trainer log: {'loss': 0.9419, 'grad_norm': 3.9403789043426514, 'learning_rate': 4.900672080908275e-07} [Rank 0] Trainer log: {'loss': 0.9419, 'grad_norm': 3.9403789043426514, 'learning_rate': 4.900672080908275e-07} {'loss': 0.9419, 'grad_norm': 3.9403789043426514, 'learning_rate': 4.900672080908275e-07, 'epoch': 0.9} [Rank 2] Trainer log: {'loss': 0.8959, 'grad_norm': 5.921530246734619, 'learning_rate': 4.87960852543421e-07} [Rank 3] Trainer log: {'loss': 0.8959, 'grad_norm': 5.921530246734619, 'learning_rate': 4.87960852543421e-07} [Rank 0] Trainer log: {'loss': 0.8959, 'grad_norm': 5.921530246734619, 'learning_rate': 4.87960852543421e-07}[Rank 1] Trainer log: {'loss': 0.8959, 'grad_norm': 5.921530246734619, 'learning_rate': 4.87960852543421e-07} {'loss': 0.8959, 'grad_norm': 5.921530246734619, 'learning_rate': 4.87960852543421e-07, 'epoch': 0.91} [Rank 3] Trainer log: {'loss': 1.055, 'grad_norm': 2.1771347522735596, 'learning_rate': 4.858589201996433e-07}[Rank 0] Trainer log: {'loss': 1.055, 'grad_norm': 2.1771347522735596, 'learning_rate': 4.858589201996433e-07} [Rank 1] Trainer log: {'loss': 1.055, 'grad_norm': 2.1771347522735596, 'learning_rate': 4.858589201996433e-07} [Rank 2] Trainer log: {'loss': 1.055, 'grad_norm': 2.1771347522735596, 'learning_rate': 4.858589201996433e-07} {'loss': 1.055, 'grad_norm': 2.1771347522735596, 'learning_rate': 4.858589201996433e-07, 'epoch': 0.91} [Rank 0] Trainer log: {'loss': 0.8018, 'grad_norm': 9.169463157653809, 'learning_rate': 4.837614120369128e-07} [Rank 3] Trainer log: {'loss': 0.8018, 'grad_norm': 9.169463157653809, 'learning_rate': 4.837614120369128e-07} [Rank 1] Trainer log: {'loss': 0.8018, 'grad_norm': 9.169463157653809, 'learning_rate': 4.837614120369128e-07} [Rank 2] Trainer log: {'loss': 0.8018, 'grad_norm': 9.169463157653809, 'learning_rate': 4.837614120369128e-07} {'loss': 0.8018, 'grad_norm': 9.169463157653809, 'learning_rate': 4.837614120369128e-07, 'epoch': 0.91} [Rank 1] Trainer log: {'loss': 0.7282, 'grad_norm': 2.8889811038970947, 'learning_rate': 4.816683290305968e-07}[Rank 0] Trainer log: {'loss': 0.7282, 'grad_norm': 2.8889811038970947, 'learning_rate': 4.816683290305968e-07} [Rank 3] Trainer log: {'loss': 0.7282, 'grad_norm': 2.8889811038970947, 'learning_rate': 4.816683290305968e-07} [Rank 2] Trainer log: {'loss': 0.7282, 'grad_norm': 2.8889811038970947, 'learning_rate': 4.816683290305968e-07} {'loss': 0.7282, 'grad_norm': 2.8889811038970947, 'learning_rate': 4.816683290305968e-07, 'epoch': 0.91} [Rank 0] Trainer log: {'loss': 0.5659, 'grad_norm': 1.881841778755188, 'learning_rate': 4.79579672153998e-07}[Rank 3] Trainer log: {'loss': 0.5659, 'grad_norm': 1.881841778755188, 'learning_rate': 4.79579672153998e-07} [Rank 1] Trainer log: {'loss': 0.5659, 'grad_norm': 1.881841778755188, 'learning_rate': 4.79579672153998e-07} [Rank 2] Trainer log: {'loss': 0.5659, 'grad_norm': 1.881841778755188, 'learning_rate': 4.79579672153998e-07} {'loss': 0.5659, 'grad_norm': 1.881841778755188, 'learning_rate': 4.79579672153998e-07, 'epoch': 0.91} [Rank 0] Trainer log: {'loss': 0.8104, 'grad_norm': 3.9715282917022705, 'learning_rate': 4.774954423783706e-07}[Rank 3] Trainer log: {'loss': 0.8104, 'grad_norm': 3.9715282917022705, 'learning_rate': 4.774954423783706e-07}[Rank 1] Trainer log: {'loss': 0.8104, 'grad_norm': 3.9715282917022705, 'learning_rate': 4.774954423783706e-07} [Rank 2] Trainer log: {'loss': 0.8104, 'grad_norm': 3.9715282917022705, 'learning_rate': 4.774954423783706e-07} {'loss': 0.8104, 'grad_norm': 3.9715282917022705, 'learning_rate': 4.774954423783706e-07, 'epoch': 0.91} [Rank 3] Trainer log: {'loss': 0.7233, 'grad_norm': 6.943088054656982, 'learning_rate': 4.7541564067290046e-07} [Rank 0] Trainer log: {'loss': 0.7233, 'grad_norm': 6.943088054656982, 'learning_rate': 4.7541564067290046e-07}[Rank 2] Trainer log: {'loss': 0.7233, 'grad_norm': 6.943088054656982, 'learning_rate': 4.7541564067290046e-07} [Rank 1] Trainer log: {'loss': 0.7233, 'grad_norm': 6.943088054656982, 'learning_rate': 4.7541564067290046e-07} {'loss': 0.7233, 'grad_norm': 6.943088054656982, 'learning_rate': 4.7541564067290046e-07, 'epoch': 0.91} [Rank 1] Trainer log: {'loss': 0.7962, 'grad_norm': 6.085477828979492, 'learning_rate': 4.7334026800471945e-07} [Rank 0] Trainer log: {'loss': 0.7962, 'grad_norm': 6.085477828979492, 'learning_rate': 4.7334026800471945e-07}[Rank 3] Trainer log: {'loss': 0.7962, 'grad_norm': 6.085477828979492, 'learning_rate': 4.7334026800471945e-07} [Rank 2] Trainer log: {'loss': 0.7962, 'grad_norm': 6.085477828979492, 'learning_rate': 4.7334026800471945e-07} {'loss': 0.7962, 'grad_norm': 6.085477828979492, 'learning_rate': 4.7334026800471945e-07, 'epoch': 0.91} [Rank 1] Trainer log: {'loss': 0.6936, 'grad_norm': 3.458329677581787, 'learning_rate': 4.712693253389e-07}[Rank 3] Trainer log: {'loss': 0.6936, 'grad_norm': 3.458329677581787, 'learning_rate': 4.712693253389e-07}[Rank 0] Trainer log: {'loss': 0.6936, 'grad_norm': 3.458329677581787, 'learning_rate': 4.712693253389e-07} [Rank 2] Trainer log: {'loss': 0.6936, 'grad_norm': 3.458329677581787, 'learning_rate': 4.712693253389e-07} {'loss': 0.6936, 'grad_norm': 3.458329677581787, 'learning_rate': 4.712693253389e-07, 'epoch': 0.91} [Rank 1] Trainer log: {'loss': 0.6519, 'grad_norm': 4.977847099304199, 'learning_rate': 4.6920281363845297e-07} [Rank 0] Trainer log: {'loss': 0.6519, 'grad_norm': 4.977847099304199, 'learning_rate': 4.6920281363845297e-07}[Rank 3] Trainer log: {'loss': 0.6519, 'grad_norm': 4.977847099304199, 'learning_rate': 4.6920281363845297e-07} [Rank 2] Trainer log: {'loss': 0.6519, 'grad_norm': 4.977847099304199, 'learning_rate': 4.6920281363845297e-07} {'loss': 0.6519, 'grad_norm': 4.977847099304199, 'learning_rate': 4.6920281363845297e-07, 'epoch': 0.91} [Rank 3] Trainer log: {'loss': 0.9911, 'grad_norm': 5.911740303039551, 'learning_rate': 4.6714073386432745e-07}[Rank 0] Trainer log: {'loss': 0.9911, 'grad_norm': 5.911740303039551, 'learning_rate': 4.6714073386432745e-07}[Rank 2] Trainer log: {'loss': 0.9911, 'grad_norm': 5.911740303039551, 'learning_rate': 4.6714073386432745e-07} [Rank 1] Trainer log: {'loss': 0.9911, 'grad_norm': 5.911740303039551, 'learning_rate': 4.6714073386432745e-07} {'loss': 0.9911, 'grad_norm': 5.911740303039551, 'learning_rate': 4.6714073386432745e-07, 'epoch': 0.91} [Rank 2] Trainer log: {'loss': 1.0587, 'grad_norm': 3.858769416809082, 'learning_rate': 4.6508308697541525e-07}[Rank 1] Trainer log: {'loss': 1.0587, 'grad_norm': 3.858769416809082, 'learning_rate': 4.6508308697541525e-07}[Rank 3] Trainer log: {'loss': 1.0587, 'grad_norm': 3.858769416809082, 'learning_rate': 4.6508308697541525e-07} [Rank 0] Trainer log: {'loss': 1.0587, 'grad_norm': 3.858769416809082, 'learning_rate': 4.6508308697541525e-07} {'loss': 1.0587, 'grad_norm': 3.858769416809082, 'learning_rate': 4.6508308697541525e-07, 'epoch': 0.91} [Rank 3] Trainer log: {'loss': 0.863, 'grad_norm': 2.815195083618164, 'learning_rate': 4.6302987392854547e-07}[Rank 0] Trainer log: {'loss': 0.863, 'grad_norm': 2.815195083618164, 'learning_rate': 4.6302987392854547e-07} [Rank 1] Trainer log: {'loss': 0.863, 'grad_norm': 2.815195083618164, 'learning_rate': 4.6302987392854547e-07} [Rank 2] Trainer log: {'loss': 0.863, 'grad_norm': 2.815195083618164, 'learning_rate': 4.6302987392854547e-07} {'loss': 0.863, 'grad_norm': 2.815195083618164, 'learning_rate': 4.6302987392854547e-07, 'epoch': 0.91}