2025-04-24 10:21:52,552 INFO [train.py:653] {
  "allowed_excess_duration_ratio": 0.1,
  "audio_key": "question_audio",
  "batch_idx_train": 0,
  "best_train_epoch": -1,
  "best_train_loss": Infinity,
  "best_valid_epoch": -1,
  "best_valid_loss": Infinity,
  "bucketing_sampler": true,
  "deepscale": false,
  "deepscale_config": null,
  "deepspeed": true,
  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
  "drop_last": true,
  "enable_musan": false,
  "enable_spec_aug": true,
  "enable_speech_output": true,
  "encoder_projector_ds_rate": 8,
  "env_info": {
    "IP address": "0.114.183.253",
    "hostname": "7518205",
    "icefall-git-branch": null,
    "icefall-git-date": null,
    "icefall-git-sha1": null,
    "icefall-path": "/workspace/slam/icefall_omni",
    "k2-build-type": "Release",
    "k2-git-date": "Tue Oct 29 09:02:19 2024",
    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
    "k2-version": "1.24.4",
    "k2-with-cuda": true,
    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
    "python-version": "3.11",
    "torch-cuda-available": true,
    "torch-cuda-version": "12.4",
    "torch-version": "2.4.0"
  },
  "exp_dir": "slam_omni/exp_speech2speech_rerun",
  "frame_shift_ms": 10,
  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
  "input_strategy": "PrecomputedFeatures",
  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
  "log_interval": 50,
  "manifest_dir": "data/fbank",
  "max_duration": 50,
  "num_buckets": 30,
  "num_epochs": 10,
  "num_workers": 2,
  "on_the_fly_feats": false,
  "pretrained_model_path": null,
  "resample_to_16kHz": true,
  "reset_interval": 200,
  "return_cuts": true,
  "sampler_state_dict_path": null,
  "seed": 42,
  "shuffle": true,
  "spec_aug_time_warp_factor": 80,
  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
  "start_epoch": 1,
  "subsampling_factor": 2,
  "tensorboard": true,
  "text_key": "answer",
  "unfreeze_llm": true,
  "unfreeze_speech_projector": true,
  "use_flash_attn": true,
  "use_fp16": true,
  "use_lora": true,
  "valid_interval": 5000
}
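The config above enables LoRA (use_lora: true) with unfreeze_llm and unfreeze_speech_projector, and the lora_A/lora_B shapes listed below imply rank-64 adapters on every attention and MLP projection of the Qwen2.5-0.5B LLM. As a rough illustration only (this is not the actual train.py code, and the lora_alpha/lora_dropout values are assumptions the log does not record), a PEFT setup consistent with those shapes would look like:

# Hypothetical sketch of a LoRA setup matching the logged adapter shapes.
# Not taken from train.py; alpha/dropout values are assumptions.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
lora_cfg = LoraConfig(
    r=64,                # matches lora_A: [64, d_in] and lora_B: [d_out, 64] below
    lora_alpha=16,       # assumption; not recoverable from this log
    lora_dropout=0.05,   # assumption
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
llm = get_peft_model(llm, lora_cfg)
# Parameter names then take the form seen in the listing below, e.g.
# base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight

A similar wrapper evidently covers the codec LM, whose adapter shapes appear at the end of the listing.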
2025-04-24 10:21:52,552 INFO [train.py:655] About to create model
2025-04-24 10:22:24,486 INFO [train.py:808] Number of model parameters: 1327766148
2025-04-24 10:22:24,487 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
2025-04-24 10:22:24,487 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
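The encoder_projector shapes just above line up with encoder_projector_ds_rate: 8 from the config: eight consecutive 1280-dim Whisper-large-v2 encoder frames are stacked into a 10240-dim vector and projected into the LLM's 896-dim embedding space, hence linear1.weight: [896, 10240]. A minimal sketch of such a projector, with the structure inferred only from these shapes (the actual module is defined in the SLAM-Omni recipe; the activation choice here is an assumption):

import torch.nn as nn

class EncoderProjector(nn.Module):
    # Structure inferred from the logged shapes:
    #   linear1.weight [896, 10240], linear2.weight [896, 896]
    def __init__(self, encoder_dim=1280, llm_dim=896, ds_rate=8):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(encoder_dim * ds_rate, llm_dim)  # 10240 -> 896
        self.act = nn.ReLU()  # activation is an assumption, not in the log
        self.linear2 = nn.Linear(llm_dim, llm_dim)                # 896 -> 896

    def forward(self, x):
        # x: (batch, frames, 1280) Whisper encoder output
        b, t, d = x.shape
        t = t - t % self.ds_rate  # drop ragged tail frames
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(self.act(self.linear1(x)))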
2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813]
codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048]) 2025-04-24 10:22:24,512 INFO [train.py:813] 
codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64]) 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024]) 2025-04-24 10:22:24,514 INFO [train.py:813] 
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
2025-04-24 10:22:24,514 INFO [train.py:819] Device: cuda:3
2025-04-24 10:22:25,742 INFO [train.py:823] Using DeepSpeed
2025-04-24 10:22:52,214 INFO [data_module.py:445] About to get train cuts
2025-04-24 10:22:52,216 INFO [data_module.py:250] Disable MUSAN
2025-04-24 10:22:52,216 INFO [data_module.py:268] Enable SpecAugment
2025-04-24 10:22:52,216 INFO [data_module.py:269] Time warp factor: 80
2025-04-24 10:22:52,216 INFO [data_module.py:279] Num frame mask: 10
2025-04-24 10:22:52,217 INFO [data_module.py:292] About to create train dataset
2025-04-24 10:22:52,217 INFO [data_module.py:319] Using DynamicBucketingSampler.
2025-04-24 10:22:53,219 INFO [data_module.py:336] About to create train dataloader
2025-04-24 10:22:53,219 INFO [data_module.py:436] About to get test cuts
2025-04-24 10:22:53,220 INFO [data_module.py:365] About to create dev dataset
2025-04-24 10:22:53,308 INFO [data_module.py:379] About to create dev dataloader
2025-04-24 10:22:53,315 INFO [train.py:875] start training from epoch 1
2025-04-24 10:23:23,440 INFO [train.py:539] Computing validation loss
2025-04-24 10:23:31,208 INFO [train.py:548] Epoch 1, validation: loss=0.005359, acc=0.4658, codec_acc=0.0004324, codec_topk_acc=0.003407, codec_loss=0.004039, text_loss=0.00132, over 285507.00 frames.
2025-04-24 10:23:31,209 INFO [train.py:549] Maximum memory allocated so far is 12383MB
2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.005086, acc=0.4173, codec_acc=0.001085, codec_topk_ac
2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.004863, acc=0.4196, codec_acc=0.0002915, codec_topk_acc=0.002623, codec_loss=0.00371, text_loss=0.001153, over 2279.00 frames. ], tot_loss[loss=0.004863, acc=0.4196, codec_acc=0.000
2025-04-24 10:23:39,106 INFO [train.py:940] Saving batch to slam_omni/exp_speech2speech_rerun/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
2025-04-24 10:23:39,117 INFO [train.py:945] features shape: torch.Size([4, 1168, 80])
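
Note on reading the shape dump: PyTorch stores an nn.Linear weight as [out_features, in_features], and PEFT-style LoRA factorizes each adapted projection into lora_A ([r, in_features]) and lora_B ([out_features, r]), so the rank r = 64 can be read directly off the listing. The sketch below is illustrative only (it is not part of train.py); it assumes, based on the shapes above, a codec_lm decoder with hidden size 1024, MLP intermediate size 2048, and 12 layers (the dump runs through layers.11, presumably starting at layers.0), and reproduces the per-module shapes plus the implied trainable-parameter count:

    # Illustrative sketch (not from train.py): reconstruct the codec_lm LoRA
    # shape pattern logged above. Assumed from the log: rank r=64, hidden
    # size 1024, MLP intermediate size 2048, decoder layers 0-11.
    import torch.nn as nn

    r, hidden, ffn, num_layers = 64, 1024, 2048, 12
    targets = {  # adapted module -> (in_features, out_features)
        "self_attn.q_proj": (hidden, hidden),
        "self_attn.k_proj": (hidden, hidden),
        "self_attn.v_proj": (hidden, hidden),
        "self_attn.o_proj": (hidden, hidden),
        "mlp.gate_proj": (hidden, ffn),
        "mlp.up_proj": (hidden, ffn),
        "mlp.down_proj": (ffn, hidden),
    }

    per_layer = 0
    for name, (d_in, d_out) in targets.items():
        lora_A = nn.Linear(d_in, r, bias=False)   # weight shape: [r, d_in]
        lora_B = nn.Linear(r, d_out, bias=False)  # weight shape: [d_out, r]
        per_layer += lora_A.weight.numel() + lora_B.weight.numel()
        print(f"{name}: lora_A {tuple(lora_A.weight.shape)}, "
              f"lora_B {tuple(lora_B.weight.shape)}")

    print(f"LoRA params per layer: {per_layer:,}")               # 1,114,112
    print(f"codec_lm LoRA params:  {num_layers * per_layer:,}")  # 13,369,344

The same convention accounts for the last four trainable entries: speech_token_projector's weight [1024, 1792] and bias [1024] are consistent with an nn.Linear(in_features=1792, out_features=1024), and codec_lm_head's weight [4100, 1024] and bias [4100] with an nn.Linear(1024, 4100), i.e. a 4100-way output layer over the 1024-dim codec-LM hidden states.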