diff --git "a/train.log" "b/train.log" --- "a/train.log" +++ "b/train.log" @@ -1,1352 +1,1207 @@ +[2025-04-08 05:57:48,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-08 05:57:50,635] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2025-04-08 05:57:50,636] [INFO] [runner.py:607:main] cmd = /home/paperspace/miniconda3/envs/aha/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29506 --enable_each_rank_log=None train.py --deepspeed configs/deepspeed/zero2offload.json --bf16 true --tf32 true --dataset_config configs/datasets/paperspace_configuration.json --llm_pretrained lmms-lab/llava-onevision-qwen2-7b-ov --num_train_epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 8 --gradient_checkpointing true --evaluation_strategy no --prediction_loss_only false --save_strategy steps --save_steps 500 --save_total_limit 5 --learning_rate 0.00002 --optim adamw_torch --lr_scheduler_type cosine --warmup_ratio 0.05 --dataloader_num_workers 4 --logging_steps 10 --report_to wandb --output_dir outputs/aha +[2025-04-08 05:57:51,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-08 05:57:53,577] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]} +[2025-04-08 05:57:53,577] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0 +[2025-04-08 05:57:53,577] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1]}) +[2025-04-08 05:57:53,577] [INFO] [launch.py:164:main] dist_world_size=2 +[2025-04-08 05:57:53,577] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1 +[2025-04-08 05:57:53,578] [INFO] [launch.py:256:main] process 15420 spawned with command: ['/home/paperspace/miniconda3/envs/aha/bin/python', '-u', 'train.py', '--local_rank=0', '--deepspeed', 'configs/deepspeed/zero2offload.json', '--bf16', 'true', '--tf32', 'true', '--dataset_config', 'configs/datasets/paperspace_configuration.json', '--llm_pretrained', 'lmms-lab/llava-onevision-qwen2-7b-ov', '--num_train_epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', 'true', '--evaluation_strategy', 'no', '--prediction_loss_only', 'false', '--save_strategy', 'steps', '--save_steps', '500', '--save_total_limit', '5', '--learning_rate', '0.00002', '--optim', 'adamw_torch', '--lr_scheduler_type', 'cosine', '--warmup_ratio', '0.05', '--dataloader_num_workers', '4', '--logging_steps', '10', '--report_to', 'wandb', '--output_dir', 'outputs/aha'] +[2025-04-08 05:57:53,578] [INFO] [launch.py:256:main] process 15421 spawned with command: ['/home/paperspace/miniconda3/envs/aha/bin/python', '-u', 'train.py', '--local_rank=1', '--deepspeed', 'configs/deepspeed/zero2offload.json', '--bf16', 'true', '--tf32', 'true', '--dataset_config', 'configs/datasets/paperspace_configuration.json', '--llm_pretrained', 'lmms-lab/llava-onevision-qwen2-7b-ov', '--num_train_epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', 'true', '--evaluation_strategy', 'no', '--prediction_loss_only', 'false', '--save_strategy', 'steps', '--save_steps', '500', '--save_total_limit', '5', '--learning_rate', '0.00002', '--optim', 
'adamw_torch', '--lr_scheduler_type', 'cosine', '--warmup_ratio', '0.05', '--dataloader_num_workers', '4', '--logging_steps', '10', '--report_to', 'wandb', '--output_dir', 'outputs/aha'] +[2025-04-08 05:57:56,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-08 05:57:56,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) Global rank 0, Local Rank: 0 initiated +[2025-04-08 05:57:58,331] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-08 05:57:58,331] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl Global rank 1, Local Rank: 1 initiated -[2025-04-05 03:17:31,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2025-04-05 03:17:31,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2025-04-05 03:17:32,779] [INFO] [comm.py:652:init_distributed] cdb=None -[2025-04-05 03:17:32,779] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2025-04-05 03:17:32,784] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-08 05:57:58,371] [INFO] [comm.py:652:init_distributed] cdb=None GPU 0 - Using device: cuda GPU 1 - Using device: cuda -LiveTrainingArguments( -_n_gpu=1, -accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, -adafactor=False, -adam_beta1=0.9, -adam_beta2=0.999, -adam_epsilon=1e-08, -attn_implementation=flash_attention_2, -augmentation=False, -auto_find_batch_size=False, -bf16=True, -bf16_full_eval=False, -data_seed=None, -dataloader_drop_last=False, -dataloader_num_workers=4, -dataloader_persistent_workers=False, -dataloader_pin_memory=True, -dataloader_prefetch_factor=None, -dataset_config=configs/datasets/paperspace_configuration.json, -ddp_backend=None, -ddp_broadcast_buffers=None, -ddp_bucket_cap_mb=None, -ddp_find_unused_parameters=None, -ddp_timeout=1800, -debug=[], -deepspeed=configs/deepspeed/zero2.json, -disable_tqdm=False, -dispatch_batches=None, -do_eval=False, -do_predict=False, -do_train=False, -embed_mark=2fps_384_1+3x3, -eval_accumulation_steps=None, -eval_delay=0, -eval_steps=None, -evaluation_strategy=no, -finetune_modules=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], -first_n_frames_no_generate=0, -fp16=False, -fp16_backend=auto, -fp16_full_eval=False, -fp16_opt_level=O1, -frame_fps=2, -frame_num_tokens=49, -frame_resolution=384, -frame_token_cls=False, -frame_token_pooled=[7, 7], -fsdp=[], -fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, -fsdp_min_num_params=0, -fsdp_transformer_layer_cls_to_wrap=None, -full_determinism=False, -gradient_accumulation_steps=16, -gradient_checkpointing=True, -gradient_checkpointing_kwargs=None, -greater_is_better=None, -grounding_mode=False, -group_by_length=False, -half_precision_backend=auto, -hub_always_push=False, -hub_model_id=None, -hub_private_repo=False, -hub_strategy=every_save, -hub_token=, -ignore_data_skip=False, -include_inputs_for_metrics=False, -include_num_input_tokens_seen=False, -include_tokens_per_second=False, -input_dir=dataset/tvsum/ydata-tvsum50-v1_1/video, -jit_mode_eval=False, -label_names=None, -label_smoothing_factor=0.0, -learning_rate=2e-05, -length_column_name=length, -live_version=live1+, 
-llm_pretrained=lmms-lab/llava-onevision-qwen2-7b-ov, -load_best_model_at_end=False, -local_rank=0, -log_level=passive, -log_level_replica=warning, -log_on_each_node=True, -logging_dir=outputs/aha/runs/Apr05_03-17-31_psdppq1rwe2a, -logging_first_step=False, -logging_nan_inf_filter=True, -logging_steps=10, -logging_strategy=steps, -lora_alpha=32, -lora_modules=model\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$, -lora_pretrained=None, -lora_r=16, -lr_scheduler_kwargs={}, -lr_scheduler_type=cosine, -max_grad_norm=1.0, -max_num_frames=100, -max_steps=-1, -metric_for_best_model=None, -mp_parameters=, -neftune_noise_alpha=None, -no_cuda=False, -num_train_epochs=1.0, -optim=adamw_torch, -optim_args=None, -optim_target_modules=None, -output_dir=outputs/aha, -overwrite_output_dir=False, -past_index=-1, -per_device_eval_batch_size=1, -per_device_train_batch_size=1, -prediction_loss_only=False, -push_to_hub=True, -push_to_hub_model_id=None, -push_to_hub_organization=None, -push_to_hub_token=, -quantization=False, -ray_scope=last, -remove_unused_columns=True, -report_to=['wandb'], -resume_from_checkpoint=None, -run_name=outputs/aha, -save_on_each_node=False, -save_only_model=False, -save_safetensors=True, -save_steps=500, -save_strategy=steps, -save_total_limit=5, -seed=42, -skip_memory_metrics=True, -split_batches=None, -stream_loss_weight=1.0, -tf32=True, -torch_compile=False, -torch_compile_backend=None, -torch_compile_mode=None, -torchdynamo=None, -tpu_metrics_debug=False, -tpu_num_cores=None, -use_cache=False, -use_cpu=False, -use_ipex=False, -use_legacy_prediction_loop=False, -use_mps_device=False, -v_placeholder=, -video_pooling_stride=4, -vision_pretrained=google/siglip-large-patch16-384, -warmup_ratio=0.05, -warmup_steps=0, -weight_decay=0.0, -) -using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training Rank 0: Loading vision tower: google/siglip-so400m-patch14-384 -using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training -creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) -creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) -trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 -freezing ViT +using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training +creating lora with config: LoraConfig(peft_type=, auto_mapping=None, 
base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None) +creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None) +trainable params: 40,370,176 || all params: 8,070,733,344 || trainable%: 0.500204557371633 load datasets/coin/videos_metadata.json... -trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 -freezing ViT -('base_model.model.model.image_newline', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.embed_tokens.weight', torch.Size([152064, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.0.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.0.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight', 
torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.0.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.0.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.0.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.1.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.1.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.1.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.1.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.1.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, 
False) -('base_model.model.model.layers.2.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.2.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.2.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.2.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.2.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.2.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) 
-('base_model.model.model.layers.3.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.3.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.3.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.3.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.3.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.3.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) 
-('base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.4.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.4.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.4.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.4.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.4.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) 
-('base_model.model.model.layers.5.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.5.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.5.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.5.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.5.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.6.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.6.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) 
-('base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.6.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.6.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.6.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.7.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.7.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.7.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.7.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) 
-('base_model.model.model.layers.7.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.8.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.8.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.8.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.8.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.8.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) 
-('base_model.model.model.layers.9.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.9.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.9.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.9.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.9.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.9.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) 
-('base_model.model.model.layers.10.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.10.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.10.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.10.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.10.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.10.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.11.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, 
True) -('base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.11.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.11.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.11.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.11.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.11.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.12.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.12.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, 
False) -('base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.12.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.12.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.12.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.13.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.13.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.13.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), 
torch.float32, True) -('base_model.model.model.layers.13.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.13.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.k_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.k_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.v_proj.weight', torch.Size([512, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.v_proj.bias', torch.Size([512]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight', torch.Size([512, 16]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.o_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.14.mlp.gate_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.14.mlp.up_proj.weight', torch.Size([18944, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) -('base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) -('base_model.model.model.layers.14.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) -('base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) -('base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) -('base_model.model.model.layers.14.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.14.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.15.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) -('base_model.model.model.layers.15.self_attn.q_proj.bias', torch.Size([3584]), torch.bfloat16, False) -('base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight', torch.Size([16, 3584]), torch.float32, True) 
-[removed parameter dump from the previous run; one (name, shape, dtype, requires_grad) tuple per log line, condensed here because the extracted hunk had many tuples run together on single lines]
-[base_model.model.model.layers.15-27 (language-model blocks): self_attn.{q,k,v,o}_proj.weight and mlp.{gate,up,down}_proj.weight (plus q/k/v_proj.bias) are torch.bfloat16 with requires_grad=False; each of these projections also lists rank-16 LoRA adapters, lora_A.default.weight torch.Size([16, in_features]) and lora_B.default.weight torch.Size([out_features, 16]), in torch.float32 with requires_grad=True; input_layernorm.weight and post_attention_layernorm.weight, torch.Size([3584]), are torch.bfloat16 with requires_grad=False; dimensions throughout: hidden 3584, k/v 512, MLP 18944]
-('base_model.model.model.norm.weight', torch.Size([3584]), torch.bfloat16, False)
-[base_model.model.model.vision_tower.vision_tower.vision_model: embeddings.patch_embedding.weight torch.Size([1152, 3, 14, 14]) and bias torch.Size([1152]); embeddings.position_embedding.weight torch.Size([729, 1152]); each encoder layer (layers 0, 1, 2, ... continuing past this hunk) lists self_attn.{k,v,q,out}_proj.weight torch.Size([1152, 1152]) with biases torch.Size([1152]), layer_norm1/layer_norm2 weight and bias torch.Size([1152]), mlp.fc1.weight torch.Size([4304, 1152]) and mlp.fc2.weight torch.Size([1152, 4304]) with biases; all torch.bfloat16 with requires_grad=False and no LoRA adapters]
-('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) 
-('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight', 
torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) 
-('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias', 
torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) 
-('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight', torch.Size([1152, 1152]), 
torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) 
-('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias', torch.Size([1152]), 
torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight', torch.Size([1152, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight', torch.Size([4304, 1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias', torch.Size([4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight', torch.Size([1152, 4304]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.weight', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.bias', torch.Size([1152]), torch.bfloat16, False) -('base_model.model.model.mm_projector.original_module.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) -('base_model.model.model.mm_projector.original_module.0.bias', torch.Size([3584]), torch.bfloat16, True) -('base_model.model.model.mm_projector.original_module.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) -('base_model.model.model.mm_projector.original_module.2.bias', torch.Size([3584]), torch.bfloat16, True) -('base_model.model.model.mm_projector.modules_to_save.default.0.weight', torch.Size([3584, 1152]), torch.bfloat16, True) -('base_model.model.model.mm_projector.modules_to_save.default.0.bias', torch.Size([3584]), torch.bfloat16, True) -('base_model.model.model.mm_projector.modules_to_save.default.2.weight', torch.Size([3584, 3584]), torch.bfloat16, True) -('base_model.model.model.mm_projector.modules_to_save.default.2.bias', 
torch.Size([3584]), torch.bfloat16, True) -('base_model.model.lm_head.original_module.weight', torch.Size([152064, 3584]), torch.bfloat16, True) -('base_model.model.lm_head.modules_to_save.default.weight', torch.Size([152064, 3584]), torch.bfloat16, True) -('base_model.model.informative_head.original_module.weight', torch.Size([2, 3584]), torch.bfloat16, True) -('base_model.model.informative_head.modules_to_save.default.weight', torch.Size([2, 3584]), torch.bfloat16, True) -('base_model.model.relevance_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) -('base_model.model.relevance_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) -('base_model.model.uncertainty_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) -('base_model.model.uncertainty_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) -load datasets/coin/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat is the action now? Please response in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30848, 30899), range(31638, 31671)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat is the action now? 
Please response in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30848, 30899), range(31638, 31671)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset MAGQAStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset MAGQAStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. 
There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1692, 1799), range(5282, 5526), range(9009, 9351), range(13863, 14264)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Starting Training! +trainable params: 40,370,176 || all params: 8,070,733,344 || trainable%: 0.500204557371633 +base_model.model.model.image_newline torch.Size([3584]) torch.bfloat16 False +base_model.model.model.embed_tokens.weight torch.Size([152064, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True +base_model.model.model.layers.0.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True +base_model.model.model.layers.0.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True +base_model.model.model.layers.0.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True +base_model.model.model.layers.0.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True 
+base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True +base_model.model.model.layers.0.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False +base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True +base_model.model.model.layers.0.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False +base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True +base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True +base_model.model.model.layers.0.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False +base_model.model.model.layers.0.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True +base_model.model.model.layers.1.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True +base_model.model.model.layers.1.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True +base_model.model.model.layers.1.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True +base_model.model.model.layers.1.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True +base_model.model.model.layers.1.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False +base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True +base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True +base_model.model.model.layers.1.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False +base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True +base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True 
+base_model.model.model.layers.1.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.1.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.2.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.2.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.2.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.2.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.2.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.2.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.2.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.2.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.3.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.3.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.3.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.3.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.3.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.3.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.3.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.3.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.4.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.4.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.4.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.4.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.4.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.4.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.4.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.4.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.5.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.5.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.5.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.5.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.5.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.5.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.5.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.5.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.6.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.6.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.6.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.6.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.6.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.6.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.6.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.6.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.7.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.7.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.7.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.7.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.7.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.7.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.7.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.7.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.8.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.8.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.8.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.8.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.8.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.8.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.8.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.8.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.9.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.9.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.9.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.9.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.9.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.9.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.9.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.9.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.10.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.10.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.10.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.10.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.10.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.10.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.10.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.10.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.11.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.11.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.11.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.11.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.11.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.11.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.11.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.11.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.12.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.12.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.12.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.12.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.12.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.12.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.12.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.12.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.13.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.13.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.13.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.13.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.13.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.13.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.13.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.13.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.14.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.14.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.14.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.14.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.14.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.14.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.14.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.14.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.15.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.15.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.15.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.15.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.15.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.15.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.15.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.15.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.16.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.16.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.16.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.16.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.16.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.16.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.16.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.16.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.17.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.17.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.17.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.17.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.17.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.17.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.17.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.17.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.18.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.18.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.18.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.18.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.18.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.18.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.18.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.18.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.19.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.19.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.19.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.19.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.19.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.19.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.19.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.19.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.20.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.20.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.20.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.20.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.20.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.20.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.20.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.20.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.21.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.21.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.21.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.21.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.21.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.21.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.21.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.21.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.22.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.22.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.22.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.22.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.22.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.22.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.22.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.22.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.23.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.23.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.23.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.23.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.23.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.23.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.23.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.23.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.24.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.24.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.24.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.24.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.24.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.24.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.24.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.24.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.24.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.24.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.24.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.24.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.25.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.25.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.25.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.25.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.25.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.25.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.25.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.25.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.25.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.25.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.25.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.25.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.26.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.26.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.26.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.26.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.26.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.26.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.26.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.26.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.26.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.26.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.26.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.26.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.q_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.27.self_attn.k_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.27.self_attn.v_proj.weight torch.Size([512, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.weight torch.Size([512, 16]) torch.float32 True
+base_model.model.model.layers.27.self_attn.o_proj.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.self_attn.o_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.self_attn.o_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.27.mlp.gate_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.mlp.gate_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.mlp.gate_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.27.mlp.up_proj.weight torch.Size([18944, 3584]) torch.bfloat16 False
+base_model.model.model.layers.27.mlp.up_proj.lora_A.default.weight torch.Size([16, 3584]) torch.float32 True
+base_model.model.model.layers.27.mlp.up_proj.lora_B.default.weight torch.Size([18944, 16]) torch.float32 True
+base_model.model.model.layers.27.mlp.down_proj.weight torch.Size([3584, 18944]) torch.bfloat16 False
+base_model.model.model.layers.27.mlp.down_proj.lora_A.default.weight torch.Size([16, 18944]) torch.float32 True
+base_model.model.model.layers.27.mlp.down_proj.lora_B.default.weight torch.Size([3584, 16]) torch.float32 True
+base_model.model.model.layers.27.input_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.layers.27.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.norm.weight torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight torch.Size([1152, 3, 14, 14]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight torch.Size([729, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False 
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias torch.Size([1152]) 
torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 
False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 
False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight torch.Size([1152, 1152]) 
torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False 
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight 
torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias 
torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False 
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False 
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias 
torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False 
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False +base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight 
torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight torch.Size([1152, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight torch.Size([4304, 1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias torch.Size([4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight torch.Size([1152, 4304]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.weight torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.vision_tower.vision_tower.vision_model.post_layernorm.bias torch.Size([1152]) torch.bfloat16 False
+base_model.model.model.mm_projector.0.weight torch.Size([3584, 1152]) torch.bfloat16 False
+base_model.model.model.mm_projector.0.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.model.mm_projector.2.weight torch.Size([3584, 3584]) torch.bfloat16 False
+base_model.model.model.mm_projector.2.bias torch.Size([3584]) torch.bfloat16 False
+base_model.model.lm_head.weight torch.Size([152064, 3584]) torch.bfloat16 False
+base_model.model.informative_head.weight torch.Size([2, 3584]) torch.bfloat16 False
+base_model.model.relevance_head.weight torch.Size([1, 3584]) torch.bfloat16 False
+base_model.model.uncertainty_head.weight torch.Size([1, 3584]) torch.bfloat16 False
+load datasets/coin/videos_metadata.json...
+Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities.
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30830, 30881), range(31620, 31653)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset MAGQAStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +Dataset MAGQAStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] -load datasets/hisum/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. 
Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] -load datasets/hisum/videos_metadata.json... -Dataset HiSumDataset has 2 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCustomized Yamaha MT-01 akrapovic HD (Without DB-killer)<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream
\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>', torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 
0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] -Dataset HiSumDataset has 2 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat are the key timestamps in the video for the topic 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n
<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 
0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] -{'train_runtime': 36.0725, 'train_samples_per_second': 0.139, 'train_steps_per_second': 0.028, 'train_loss': 1.973416805267334, 'epoch': 1.0} -Saving model to huggingface +Dataset DenseVideoCaptioningStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1692, 1799), range(5282, 5526), range(9009, 9351), range(13863, 14264)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Starting Training! 
+[1/3] /home/paperspace/miniconda3/envs/aha/bin/x86_64-conda-linux-gnu-c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/TH -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/THC -isystem /home/paperspace/miniconda3/envs/aha/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/home/paperspace/miniconda3/envs/aha/lib -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o
+[2/3] /home/paperspace/miniconda3/envs/aha/bin/x86_64-conda-linux-gnu-c++ -MMD -MF cpu_adam_impl.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/TH -isystem /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/include/THC -isystem /home/paperspace/miniconda3/envs/aha/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/home/paperspace/miniconda3/envs/aha/lib -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam_impl.cpp -o cpu_adam_impl.o
+[3/3] /home/paperspace/miniconda3/envs/aha/bin/x86_64-conda-linux-gnu-c++ cpu_adam.o cpu_adam_impl.o -shared -lcurand -L/home/paperspace/miniconda3/envs/aha/lib -L/home/paperspace/miniconda3/envs/aha/lib/python3.10/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o cpu_adam.so
+Time to load cpu_adam op: 32.586766958236694 seconds
+Time to load cpu_adam op: 19.725292444229126 seconds
+{'train_runtime': 36.6363, 'train_samples_per_second': 0.464, 'train_steps_per_second': 0.027, 'train_loss': 10.159208297729492, 'epoch': 0.89}
+[2025-04-08 06:00:37,762] [INFO] [launch.py:351:main] Process 15421 exits successfully.