[2025-04-17 14:51:53,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-04-17 14:51:53,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-04-17 14:51:53,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-04-17 14:51:53,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Global rank 3, Local Rank: 3 initiated
Global rank 2, Local Rank: 2 initiated
[2025-04-17 14:51:55,503] [INFO] [comm.py:652:init_distributed] cdb=None
[2025-04-17 14:51:55,507] [INFO] [comm.py:652:init_distributed] cdb=None
Global rank 1, Local Rank: 1 initiated
[2025-04-17 14:51:55,518] [INFO] [comm.py:652:init_distributed] cdb=None
Global rank 0, Local Rank: 0 initiated
[2025-04-17 14:51:55,550] [INFO] [comm.py:652:init_distributed] cdb=None
[2025-04-17 14:51:55,550] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
GPU 0 - Using device: cuda
GPU 3 - Using device: cuda
GPU 2 - Using device: cuda
GPU 1 - Using device: cuda
Wandb initialized
Rank 0: Loading vision tower: google/siglip-so400m-patch14-384
using lm_loss_weight: 0.2, video_loss_weight: 1, info_loss_weight: 0.5, ref_loss_weight: 6.0, uncertainty_loss_weight: 0.3, and tv_loss_weight: 0.05 for training
creating lora with config: LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|gate_proj)$', lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 1,132,472,320 || all params: 8,600,842,784 || trainable%: 13.16699244993431
[Rank 2] Distributed initialized? True
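For reference, here is a minimal sketch (not the run's actual training script) of how a LoraConfig with these hyperparameters can be built with Hugging Face peft and applied to a causal-LM backbone. The base checkpoint name is a placeholder, and the final call emits the same "trainable params || all params || trainable%" summary shown in the log (1,132,472,320 / 8,600,842,784 * 100 ≈ 13.167):

```python
# Sketch only. Assumptions: Hugging Face peft/transformers are used;
# "some/base-model" is a placeholder, not the checkpoint from this run.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,                # rank of the adapters: lora_A is [8, in_features], lora_B is [out_features, 8]
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # regex from the log: only q/k/v and gate projections inside model.layers get adapters
    target_modules=r"model\.layers.*(q_proj|k_proj|v_proj|gate_proj)$",
    # these modules stay fully trainable and are saved alongside the adapters
    modules_to_save=["connector", "mm_projector", "response_head", "lm_head",
                     "informative_head", "relevance_head", "uncertainty_head"],
)

base = AutoModelForCausalLM.from_pretrained("some/base-model")  # placeholder
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # "trainable params: ... || all params: ... || trainable%: ..."
```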
[Rank 2] Backend: nccl
load datasets/coin/videos_metadata.json...
[Rank 1] Distributed initialized? True
[Rank 1] Backend: nccl
[Rank 3] Distributed initialized? True
[Rank 3] Backend: nccl
('base_model.model.model.image_newline', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.embed_tokens.weight', torch.Size([152064, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False)
('base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True)
('base_model.model.model.layers.0.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True)
('base_model.model.model.layers.0.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False)
('base_model.model.model.layers.0.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.0.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.1.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.1.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.1.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.1.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.1.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.2.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.2.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) 
('base_model.model.model.layers.2.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.2.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.2.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.3.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.3.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.3.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.3.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) 
('base_model.model.model.layers.4.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.4.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.4.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.4.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.4.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.5.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.5.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.5.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.5.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) 
('base_model.model.model.layers.6.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.6.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.6.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.6.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.6.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.7.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) 
('base_model.model.model.layers.7.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.7.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.7.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.7.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.8.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.8.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.8.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.8.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) 
('base_model.model.model.layers.9.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.9.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.9.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.9.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.9.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.10.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.10.mlp.up_proj.weight', torch.Size([18944, 3584]), 
torch.bfloat16, False) ('base_model.model.model.layers.10.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.10.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.10.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.11.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.11.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.11.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.11.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.v_proj.weight', 
torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.12.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.12.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.12.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.12.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.13.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.13.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.13.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.13.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) 
('base_model.model.model.layers.14.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.14.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.14.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.14.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.14.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight', 
torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.15.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.15.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.15.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.15.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.16.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.16.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.16.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) 
('base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.17.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.17.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.17.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.17.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.18.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight', 
torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.18.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.18.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.18.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.19.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.19.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.19.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.19.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) 
('base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.20.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.20.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.20.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.20.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) ('base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True) ('base_model.model.model.layers.21.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True) ('base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True) ('base_model.model.model.layers.21.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.21.mlp.down_proj.weight', torch.Size([3584, 
18944]), torch.bfloat16, False)
('base_model.model.model.layers.21.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.21.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
Decoder layers 22-27 repeat the identical per-layer parameter pattern below (N = 22, 23, 24, 25, 26, 27); only the LoRA A/B matrices are fp32 and trainable:
('base_model.model.model.layers.N.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.q_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 8]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.k_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False)
('base_model.model.model.layers.N.self_attn.v_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 8]), torch.float32, True)
('base_model.model.model.layers.N.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.mlp.gate_proj.lora_A.default.weight', torch.Size([8, 3584]), torch.float32, True)
('base_model.model.model.layers.N.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 8]), torch.float32, True)
('base_model.model.model.layers.N.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False)
('base_model.model.model.layers.N.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.layers.N.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.norm.weight', torch.Size([3584]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', torch.Size([1152, 3, 14, 14]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', torch.Size([729, 1152]), torch.bfloat16, False)
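The lora_A / lora_B shapes in the decoder listing above follow the usual low-rank decomposition: for a projection whose frozen base weight has shape [out_features, in_features], lora_A.default.weight is [r, in_features] and lora_B.default.weight is [out_features, r] with r = 8, and only these two fp32 matrices carry gradients while the bf16 base weight stays frozen. A minimal PyTorch sketch of the resulting forward pass is below; the tensor names and the lora_alpha value are illustrative stand-ins, not taken from this log.

import torch
import torch.nn.functional as F

# Shapes mirror the k_proj / v_proj entries above: frozen bf16 base weight,
# rank-8 fp32 LoRA factors that are the only trainable parts.
out_features, in_features, r = 512, 3584, 8
W = torch.randn(out_features, in_features, dtype=torch.bfloat16)  # frozen base weight (random stand-in)
bias = torch.zeros(out_features, dtype=torch.bfloat16)            # frozen base bias
A = torch.randn(r, in_features, requires_grad=True)               # lora_A.default.weight: fp32, trainable
B = torch.zeros(out_features, r, requires_grad=True)              # lora_B.default.weight: fp32, trainable, zero-init
lora_alpha = 16.0                                                  # illustrative scaling, not read from this log

def lora_linear(x: torch.Tensor) -> torch.Tensor:
    # y = x W^T + bias + (lora_alpha / r) * x A^T B^T
    base = F.linear(x.to(W.dtype), W, bias).float()
    update = F.linear(F.linear(x, A), B)
    return base + (lora_alpha / r) * update

x = torch.randn(2, in_features)
print(lora_linear(x).shape)  # torch.Size([2, 512])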
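Each tuple in this listing has the form (parameter name, shape, dtype, requires_grad). A dump in the same format, together with a trainable-parameter count, could be produced by a loop along the following lines; dump_parameters and the toy nn.Linear argument are hypothetical stand-ins, not code from the training script.

import torch.nn as nn

def dump_parameters(model: nn.Module) -> None:
    """Print one (name, shape, dtype, requires_grad) tuple per parameter, then a count."""
    trainable = total = 0
    for name, param in model.named_parameters():
        print((name, param.shape, param.dtype, param.requires_grad))
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    print(f"{trainable} of {total} parameters are trainable ({100.0 * trainable / total:.2f}%)")

# Tiny stand-in module just to exercise the function; the real run would pass
# the LoRA-wrapped model instead.
dump_parameters(nn.Linear(4, 2))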
Vision-tower encoder layers 0-24 repeat the identical, fully frozen per-layer parameter pattern below (M = 0, 1, ..., 24):
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.M.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False)
('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight', torch.Size([1152, 1152]),
torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.weight', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.bias', torch.Size([1152]), torch.bfloat16, False) ('base_model.model.model.mm_projector.original_module.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.0.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.original_module.2.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.0.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) ('base_model.model.model.mm_projector.modules_to_save.default.2.bias', torch.Size([3584]), torch.bfloat16, True) ('base_model.model.lm_head.original_module.weight', torch.Size([152064, 3584]), torch.bfloat16, True) ('base_model.model.lm_head.modules_to_save.default.weight', torch.Size([152064, 3584]), torch.bfloat16, True) ('base_model.model.informative_head.original_module.weight', torch.Size([2, 3584]), torch.bfloat16, True) ('base_model.model.informative_head.modules_to_save.default.weight', torch.Size([2, 3584]), torch.bfloat16, True) 
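The (name, shape, dtype, requires_grad) tuples running through this listing (it continues for a few more head entries just below) read like a per-parameter audit of the PEFT-wrapped model: vision-tower and base language-model weights are frozen bfloat16, while the lora_A/lora_B matrices and the modules listed under modules_to_save are trainable. A minimal sketch of producing such a dump is given here; it only assumes a PyTorch `model` object and is not the exact code used in this run.

def dump_parameters(model):
    """Print one (name, shape, dtype, requires_grad) tuple per parameter, as in the log above."""
    total = trainable = 0
    for name, param in model.named_parameters():
        print((name, param.shape, param.dtype, param.requires_grad))
        total += param.numel()
        if param.requires_grad:
            trainable += param.numel()
    print(f"trainable: {trainable} / total: {total}")

Filtering the same loop by name (for example, keeping only entries containing 'lora_' or 'modules_to_save') is a quick way to confirm that the vision tower and base language-model weights stay frozen and that only the adapter matrices and the saved projector/head modules carry requires_grad=True.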
('base_model.model.relevance_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.relevance_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) [Rank 0] Distributed initialized? True [Rank 0] Backend: nccl load datasets/coin/videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHey assistant, do you know the current video content? Reply me concisely.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30872, 30923), range(31662, 31695)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30830, 30881), range(31620, 31653)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat can you tell me about? Be concise.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 4506 examples. 
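Each "Example data" record printed by these dataset classes (above and below) has the same six-element layout: the rendered chat text with interleaved <|im_start|>stream turns, a frame tensor of shape [num_frames, 3, 384, 384], a list of token ranges whose text is supervised, a per-frame label list (with -100 used as an ignore value), an optional per-frame score list, and an index. A small sanity-check sketch follows; the field names are invented for illustration and the real dataset code may unpack these records differently.

def inspect_example(example):
    # Illustrative only: field names are assumptions, not the dataset's real attributes.
    text, frames, response_ranges, frame_labels, frame_scores, idx = example
    num_frames = frames.shape[0]                  # e.g. 91 for the COIN sample above
    assert "<|im_start|>stream" in text           # stream turns mark where frames are fed in
    if frame_labels is not None:
        assert len(frame_labels) == num_frames    # expected: one 0/1 label per frame, -100 = ignore
    if frame_scores is not None:
        assert len(frame_scores) == num_frames    # expected: one relevance score per frame
    for r in response_ranges:                     # token spans that receive a language-model loss
        assert r.start < r.stop
    print(idx, num_frames, len(response_ranges))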
Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nContinuously answer what you observed with simple text.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30854, 30905), range(31644, 31677)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... load datasets/shot2story/release_134k_videos_metadata.json... load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nUse simple text to explain what is shown in front of me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. 
A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1700, 1807), range(5290, 5534), range(9017, 9359), range(13871, 14272)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHey assistant, do you know the current video content? Reply me concisely.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. 
A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1717, 1824), range(5307, 5551), range(9034, 9376), range(13888, 14289)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... load datasets/hisum/videos_metadata.json... load datasets/hisum/videos_metadata.json... Dataset DenseVideoCaptioningStreamDataset has 12000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Mr. HiSum loaded 10388 out of 12000 videos Dataset HiSumDataset has 10298 examples. 
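The HiSumDataset records below differ from the captioning and QA ones: their response-range list is empty and the per-frame annotation is a list of real-valued relevance scores rather than 0/1 labels. Given that the parameter dump above includes a relevance_head with a single output unit, one plausible use of these scores is as a per-frame regression target; the sketch below shows that idea purely as an illustration, with hypothetical tensor and module names, and is not taken from this codebase.

import torch
import torch.nn.functional as F

def relevance_regression_loss(frame_hidden, relevance_scores, relevance_head):
    # frame_hidden: [num_frames, hidden_size] hidden states at the frame positions (hypothetical)
    # relevance_scores: list of floats, one per frame, as printed in the HiSum examples
    # relevance_head: a 1-output linear layer, e.g. nn.Linear(hidden_size, 1)
    target = torch.tensor(relevance_scores, dtype=frame_hidden.dtype,
                          device=frame_hidden.device)
    pred = relevance_head(frame_hidden).squeeze(-1)   # [num_frames]
    return F.mse_loss(pred, target)                   # MSE chosen only for illustration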
Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCustomized Yamaha MT-01 akrapovic HD (Without DB-killer)<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_en
d|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>', torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Dataset HiSumDataset has 
10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nAt what timestamp can I find information about 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)' in the video?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>
stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 
0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>s
tream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 
0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat parts are relevant to the concept of 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_star
t|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 
0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] Starting Training! Resuming from checkpoint: outputs/aha/checkpoint-2825 ninja: no work to do. Time to load cpu_adam op: 2.2662734985351562 seconds Time to load cpu_adam op: 2.294476270675659 seconds ninja: no work to do. Time to load cpu_adam op: 2.2368359565734863 seconds ninja: no work to do. Time to load cpu_adam op: 2.2494730949401855 seconds [Rank 0] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06}[Rank 2] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06}[Rank 3] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06} [Rank 1] Trainer log: {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06} {'loss': 0.8192, 'grad_norm': 4.078673362731934, 'learning_rate': 8.105559973518905e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06}[Rank 3] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06}[Rank 1] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06} [Rank 0] Trainer log: {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06} {'loss': 0.8987, 'grad_norm': 2.500500202178955, 'learning_rate': 8.098864727059685e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06}[Rank 1] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06}[Rank 3] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06} [Rank 0] Trainer log: {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06} {'loss': 0.9454, 'grad_norm': 3.9836504459381104, 'learning_rate': 8.092170364649432e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06}[Rank 3] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06}[Rank 1] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06} [Rank 0] Trainer log: {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06} {'loss': 0.8776, 'grad_norm': 3.095024347305298, 'learning_rate': 8.085476889401108e-06, 'epoch': 0.58} [Rank 3] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06}[Rank 2] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06}[Rank 1] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06} [Rank 0] Trainer log: {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06} {'loss': 0.7777, 'grad_norm': 5.956109523773193, 'learning_rate': 8.078784304427249e-06, 'epoch': 0.58} [Rank 2] Trainer log: {'loss': 0.8659, 
{'loss': 0.8659, 'grad_norm': 10.085647583007812, 'learning_rate': 8.072092612839979e-06, 'epoch': 0.58}
{'loss': 0.9599, 'grad_norm': 2.4025628566741943, 'learning_rate': 8.065401817751015e-06, 'epoch': 0.58}
{'loss': 1.0365, 'grad_norm': 2.845998525619507, 'learning_rate': 8.058711922271646e-06, 'epoch': 0.58}
{'loss': 0.7633, 'grad_norm': 14.627035140991211, 'learning_rate': 8.052022929512747e-06, 'epoch': 0.58}
{'loss': 0.7598, 'grad_norm': 4.433148384094238, 'learning_rate': 8.045334842584776e-06, 'epoch': 0.58}
{'loss': 0.718, 'grad_norm': 7.4574174880981445, 'learning_rate': 8.03864766459777e-06, 'epoch': 0.58}
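Each step record in this log was originally printed by all four ranks ("[Rank 0]" through "[Rank 3] Trainer log: ...") on top of the Trainer's own dict carrying 'epoch'. A sketch of one way to keep such custom prints to the main process only, using a standard TrainerCallback; the callback class and print format here are illustrative, not the training script's actual logging code.

# Sketch: echo the Trainer's metrics dict once per step instead of once per rank.
import torch.distributed as dist
from transformers import TrainerCallback

class MainProcessLogCallback(TrainerCallback):
    """Print Trainer log dicts from the main process only."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return
        # True only on the global rank-0 process of the distributed job
        if state.is_world_process_zero:
            rank = dist.get_rank() if dist.is_initialized() else 0
            print(f"[Rank {rank}] Trainer log: {logs}")

# Usage (hypothetical): trainer.add_callback(MainProcessLogCallback())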
{'loss': 0.8095, 'grad_norm': 4.656423091888428, 'learning_rate': 8.031961398661334e-06, 'epoch': 0.58}
{'loss': 0.5886, 'grad_norm': 11.168963432312012, 'learning_rate': 8.025276047884663e-06, 'epoch': 0.59}
{'loss': 0.8638, 'grad_norm': 6.276289463043213, 'learning_rate': 8.018591615376512e-06, 'epoch': 0.59}
{'loss': 0.6845, 'grad_norm': 11.260709762573242, 'learning_rate': 8.011908104245221e-06, 'epoch': 0.59}
{'loss': 0.955, 'grad_norm': 9.418429374694824, 'learning_rate': 8.005225517598699e-06, 'epoch': 0.59}
{'loss': 0.8492, 'grad_norm': 2.7920544147491455, 'learning_rate': 7.99854385854442e-06, 'epoch': 0.59}
{'loss': 0.7462, 'grad_norm': 5.239140510559082, 'learning_rate': 7.991863130189424e-06, 'epoch': 0.59}
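Because every step record has the same fixed shape ('loss', 'grad_norm', 'learning_rate', plus 'epoch' on the Trainer's own line), the run can be inspected after the fact straight from the saved console output. A small sketch of parsing those dicts back out of a log file; the file name is a placeholder.

# Sketch: recover the per-step metrics dicts from a saved console log.
import ast
import re

# Match only the Trainer's own dicts, i.e. the ones that include 'epoch'.
LOG_DICT = re.compile(r"\{'loss':[^{}]*'epoch':[^{}]*\}")

def parse_trainer_log(path="train_console.log"):  # placeholder file name
    with open(path) as f:
        return [ast.literal_eval(m.group(0)) for m in LOG_DICT.finditer(f.read())]

if __name__ == "__main__":
    records = parse_trainer_log()
    if records:
        print("steps parsed:", len(records))
        print("mean loss:", sum(r["loss"] for r in records) / len(records))
        print("lr range:", min(r["learning_rate"] for r in records),
              "->", max(r["learning_rate"] for r in records))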
{'loss': 1.0114, 'grad_norm': 2.067471742630005, 'learning_rate': 7.985183335640332e-06, 'epoch': 0.59}
{'loss': 0.9044, 'grad_norm': 2.2515106201171875, 'learning_rate': 7.978504478003322e-06, 'epoch': 0.59}
{'loss': 0.3635, 'grad_norm': 3.6975674629211426, 'learning_rate': 7.971826560384128e-06, 'epoch': 0.59}
{'loss': 0.9571, 'grad_norm': 4.491323471069336, 'learning_rate': 7.965149585888068e-06, 'epoch': 0.59}
{'loss': 0.6083, 'grad_norm': 17.75795555114746, 'learning_rate': 7.95847355762e-06, 'epoch': 0.59}
{'loss': 0.8458, 'grad_norm': 5.3465895652771, 'learning_rate': 7.951798478684356e-06, 'epoch': 0.59}
{'loss': 0.9166, 'grad_norm': 2.1329290866851807, 'learning_rate': 7.945124352185117e-06, 'epoch': 0.59}
{'loss': 0.6711, 'grad_norm': 6.8279218673706055, 'learning_rate': 7.938451181225833e-06, 'epoch': 0.59}
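Over this stretch 'grad_norm' swings between roughly 2 and 17.8 while the loss stays in the 0.36-1.04 band. If tighter gradient clipping were wanted, HF TrainingArguments exposes it as a global-norm threshold; the value below is an assumption for illustration, not this run's setting.

# Sketch: global-norm gradient clipping via TrainingArguments (the 1.0 here is illustrative).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="outputs/aha",
    max_grad_norm=1.0,   # clip gradients to this global L2 norm before each optimizer step
)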