+ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node31.txt
Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//log_node31.txt
+ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31
+ ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31
+ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//ascend/31
+ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml
+ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
+ CKPT_LOAD_DIR=/data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/
+ VIT_CKPT_LOAD_DIR=/
+ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213//
+ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/
sending incremental file list

sent 71 bytes  received 12 bytes  166.00 bytes/sec
total size is 23.84K  speedup is 287.17
+ cd /local_disk/cognitron_vl/
+ rm -fr datasets
+ mkdir -p datasets
+ ln -s /data/data/ datasets/CV
+ ln -s /data/data/LLM datasets/LLM
+ ln -s /data/data/LMM datasets/LMM
+ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh
++ source /usr/local/Ascend/driver/bin/setenv.bash
+++ DEP_INFO_FILE=/etc/ascend_install.info
+++ [[ -f /etc/ascend_install.info ]]
+++ . /etc/ascend_install.info
+++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common
+++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver
+++ DRV_LIB64_LDPATH=/driver/lib64
+++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
++ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
++++ arch
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
++ export HCCL_CONNECT_TIMEOUT=7200
++ HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_EXEC_TIMEOUT=7200
++ HCCL_EXEC_TIMEOUT=7200
++ export COMBINED_ENABLE=1
++ COMBINED_ENABLE=1
++ export MULTI_STREAM_MEMORY_REUSE=1
++ MULTI_STREAM_MEMORY_REUSE=1
++ export HCCL_RDMA_TC=160
++ HCCL_RDMA_TC=160
++ export HCCL_RDMA_SL=5
++ HCCL_RDMA_SL=5
++ export HCCL_INTRA_PCIE_ENABLE=0
++ HCCL_INTRA_PCIE_ENABLE=0
++ export HCCL_INTRA_ROCE_ENABLE=1
++ HCCL_INTRA_ROCE_ENABLE=1
++ export HCCL_RDMA_TIMEOUT=20
++ HCCL_RDMA_TIMEOUT=20
++ export INF_NAN_MODE_ENABLE=1
++ INF_NAN_MODE_ENABLE=1
++ export DISTRIBUTED_BACKEND=hccl
++ DISTRIBUTED_BACKEND=hccl
++ export ASCEND_LAUNCH_BLOCKING=0
++ ASCEND_LAUNCH_BLOCKING=0
++ export ASCEND_SLOG_PRINT_TO_STDOUT=0
++ ASCEND_SLOG_PRINT_TO_STDOUT=0
++ export ASCEND_GLOBAL_LOG_LEVEL=3
++ ASCEND_GLOBAL_LOG_LEVEL=3
++ export ASCEND_GLOBAL_EVENT_ENABLE=0
++ ASCEND_GLOBAL_EVENT_ENABLE=0
++ export TASK_QUEUE_ENABLE=1
++ TASK_QUEUE_ENABLE=1
++ export PTCOPY_ENABLE=1
++ PTCOPY_ENABLE=1
++ export COMBINED_ENABLE=1
++ COMBINED_ENABLE=1
++ export DYNAMIC_OP=ADD#MUL
++ DYNAMIC_OP=ADD#MUL
++ export HCCL_WHITELIST_DISABLE=1
++ HCCL_WHITELIST_DISABLE=1
++ export HCCL_CONNECT_TIMEOUT=7200
++ HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_WHITELIST_DISABLE=1
++ HCCL_WHITELIST_DISABLE=1
++ export CUDA_DEVICE_MAX_CONNECTIONS=1
++ CUDA_DEVICE_MAX_CONNECTIONS=1
++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt
Looking in links: /data/software/
Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1))
Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0)
Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3))
Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0)
Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0)
Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6))
Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5)
Processing data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8))
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0)
Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10))
Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2)
Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12))
Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1)
Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5)
Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15))
Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16)
Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17))
Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18))
Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19))
Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20))
Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3)
Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25)
Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0)
Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5)
Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3)
Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6))
Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3)
Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1)
Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0)
Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1)
Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30)
Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0)
Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15)
Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0)
Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1)
Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3)
Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7)
Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1)
Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18))
Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0)
Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0)
Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2)
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0)
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.66.2
    Uninstalling tqdm-4.66.2:
      Successfully uninstalled tqdm-4.66.2
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
  Attempting uninstall: requests
    Found existing installation: requests 2.31.0
    Uninstalling requests-2.31.0:
      Successfully uninstalled requests-2.31.0
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 15.0.0
    Uninstalling pyarrow-15.0.0:
      Successfully uninstalled pyarrow-15.0.0
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.25.0
    Uninstalling accelerate-0.25.0:
      Successfully uninstalled accelerate-0.25.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.16.0
    Uninstalling datasets-2.16.0:
      Successfully uninstalled datasets-2.16.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed.
tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed.
tikit 1.8.2.240926 requires future==0.18.2, which is not installed.
tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed.
tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed.
tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed.
tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed.
tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed.
tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed.
tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed.
tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed.
tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed.
tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible.
tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible.
tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible.
tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible.
tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible.
tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible.
Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
++ return 0
+ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
+ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
+ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: megatron_core
  Building editable for megatron_core (pyproject.toml): started
  Building editable for megatron_core (pyproject.toml): finished with status 'done'
  Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=06d5bd071b6eadb2bc6965a495bd802172dae415af74dd60b1478328d6910bcd
  Stored in directory: /tmp/pip-ephem-wheel-cache-wolh2e_3/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff
Successfully built megatron_core
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: megatron_core
Successfully installed megatron_core-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA'
WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA'
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: mindspeed
  Running setup.py develop for mindspeed
Successfully installed mindspeed-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/ModelLink
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4)
Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1)
Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4)
Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1)
Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1)
Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0)
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0)
Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6)
Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2)
Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0)
Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3)
Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1)
Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1)
Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25)
Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3)
Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1)
Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3)
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1)
Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3)
Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5)
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py): started
  Building wheel for transformers_stream_generator (setup.py): finished with status 'done'
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=3ed62a866ab10917ceed94a0bafc0596380802f798ed67b7de78b76fe0b65f1f
  Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c
Successfully built transformers_stream_generator
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
  Attempting uninstall: peft
    Found existing installation: peft 0.7.0
    Uninstalling peft-0.7.0:
      Successfully uninstalled peft-0.7.0
  Running setup.py develop for modellink
Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ GPUS_PER_NODE=16
+ NNODES=32
+ NODE_RANK=31
+ MASTER_PORT=34567
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ VISION_SEQ_LENGTH=1025
+ IMAGE_TOKEN_LENGTH=256
+ IMAGE_SIZE=448
+ VISION_MODEL_TYPE=intern_300m
+ TP=8
+ PP=1
+ CP=2
+ CP_ALGO=megatron_cp_algo
+ CP_MASK=causal
+ DISTRIBUTED_ARGS='
    --nproc_per_node 16     --nnodes 32     --node_rank 31     --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local     --master_port 34567
'
+ GPT_ARGS='
    --use-mcore-models     --tensor-model-parallel-size 8     --pipeline-model-parallel-size 1     --context-parallel-size 2     --context-parallel-algo megatron_cp_algo     --cp-attention-mask-type causal     --use-cp-send-recv-overlap     --no-create-attention-mask-in-dataloader     --sparse-mode 4     --sequence-parallel     --recompute-method block     --recompute-granularity full     --recompute-num-layers 48     --num-layers 48     --hidden-size 5120     --ffn-hidden-size 13824     --num-attention-heads 40     --group-query-attention     --num-query-groups 8     --tokenizer-type PretrainedFromHF     --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/     --seq-length 131072     --max-position-embeddings 131072     --micro-batch-size 1     --global-batch-size 64     --make-vocab-size-divisible-by 1     --padded-vocab-size 152064     --rotary-base 1000000.0     --lr 5.00e-6     --train-iters 1000     --lr-decay-style cosine     --untie-embeddings-and-output-weights     --disable-bias-linear     --attention-dropout 0.0     --init-method-std 0.01     --hidden-dropout 0.0     --position-embedding-type rope     --normalization RMSNorm     --use-fused-rmsnorm     --norm-epsilon 1e-6     --swiglu     --use-flash-attn     --use-fused-rotary-pos-emb     --use-rotary-position-embeddings     --use-fused-swiglu     --use-mc2     --no-masked-softmax-fusion     --attention-softmax-in-fp32     --min-lr 1.00e-7     --weight-decay 0.0     --lr-warmup-fraction 0.03     --clip-grad 1.0     --adam-beta1 0.9     --adam-beta2 0.999     --add-qkv-bias     --initial-loss-scale 4096     --no-gradient-accumulation-fusion     --use-distributed-optimizer     --bf16     --overlap-grad-reduce     --finetune     --vision-model-freeze     --vision-model-type intern_300m     --vision-downsample-ratio 0.5     --vision-projector-type mlp     --vision-projector-pre-norm     --vision-process-type dynamic     --vision-normalize-type imagenet     --vision-seq-length 1025     --image-token-length 256     --image-size 448     --prompt-format qwen2     --is-instruction-dataset     --max-num-frame 512     --max-fps 1     --add-class-token     --min-patch-grid 1     --max-patch-grid 12     --cross-dataset-joint '
+ DATA_ARGS='
    --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml     --split 100,0,0     --data-seq-length 131072     --num-workers 8 '
+ CKPT_ARGS='
    --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/     --vit-load /     --no-load-optim     --no-load-rng     --seed 424242     --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// '
+ OUTPUT_ARGS='
    --log-interval 1     --save-interval 100     --eval-interval 100     --eval-iters 0     --log-throughput     --distributed-timeout-minutes 120 '
+ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1197954740059955456-93njiyzl9b0g-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 131072 --max-position-embeddings 131072 --micro-batch-size 1 --global-batch-size 64 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 1000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-frame 512 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage3.yaml --split 100,0,0 --data-seq-length 131072 --num-workers 8 --load /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/ --vit-load / --no-load-optim --no-load-rng --seed 424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213// --log-interval 1 --save-interval 100 --eval-interval 100 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl
[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] 
[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] *****************************************
[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[2024-11-27 12:44:15,622] torch.distributed.run: [WARNING] *****************************************
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...

Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp...
Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja...
Building extension module adaptive_cp...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o 
[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
> compiling dataset index builder ...
make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.077 seconds
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute Falsevision_projector_recompute False

vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
vision_model_freeze
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.vision_model_freeze
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.




=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.



=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
vision_model_freeze
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
vision_model_freezemodel GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freezevision_model_freeze

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)





_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)





_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)





_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)





_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)




_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)



_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration _load_base_checkpoint iteration   5000  _load_base_checkpoint iteration 5000  50005000_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration
50005000
  5000_load_base_checkpoint iteration50005000


_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release 
False 
_load_base_checkpoint release 50005000
 

   False _load_base_checkpoint release5000   5000

_load_base_checkpoint release5000_load_base_checkpoint release_load_base_checkpoint release 5000FalseFalse
False
False5000False_load_base_checkpoint release
 _load_base_checkpoint release _load_base_checkpoint release 




_load_base_checkpoint release

 
 False FalseFalse_load_base_checkpoint release_load_base_checkpoint releaseFalse_load_base_checkpoint release 
False
False

  False

FalseFalse


_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_04/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_07/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_01/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_06/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/lcvlm_modellink/scripts/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1_stage2.sh/20241014_131952/iter_0005000/mp_rank_00/model_optim_rng.pt
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
(min, max) time across ranks (ms):
    load-checkpoint ................................: (11120.76, 11123.74)
> rank 500 does not create GPT datasets ...
> rank 497 does not create GPT datasets ...
> rank 507 does not create GPT datasets ...> rank 499 does not create GPT datasets ...

> rank 506 does not create GPT datasets ...
> rank 498 does not create GPT datasets ...
> rank 501 does not create GPT datasets ...
> rank 509 does not create GPT datasets ...
> rank 502 does not create GPT datasets ...
> rank 510 does not create GPT datasets ...> rank 505 does not create GPT datasets ...

> rank 511 does not create GPT datasets ...
> rank 503 does not create GPT datasets ...
> rank 508 does not create GPT datasets ...
> rank 496 is creating GPT datasets ...
> rank 504 is creating GPT datasets ...
target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)]
possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]]
target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)]
possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]]
(min, max) time across ranks (ms):
    model-and-optimizer-setup ......................: (11717.92, 11729.42)
    train/valid/test-data-iterators-setup ..........: (287040.22, 287455.78)
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
................................................................................................ [2024-11-27 12:54:56] iteration        1/    1000 | consumed samples:           64 | elapsed time per iteration (ms): 279519.2 | throughput per GPU (TFLOP/s/GPU): 27.6 | learning rate: 1.666667E-07 | global batch size:    64 | lm loss: 1.334420E+00 | loss scale: 1.0 | grad norm: 7.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555def251b40] Missing reference picture, default is 65530
[h264 @ 0x555def251b40] Missing reference picture, default is 65530
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] Missing reference picture, default is 65530
[h264 @ 0x555def251b40] Missing reference picture, default is 65530
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530
[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530
[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] Missing reference picture, default is 65530
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] Missing reference picture, default is 65530
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-27 12:57:02] iteration        2/    1000 | consumed samples:          128 | elapsed time per iteration (ms): 126706.1 | throughput per GPU (TFLOP/s/GPU): 60.8 | learning rate: 3.333333E-07 | global batch size:    64 | lm loss: 9.522201E-01 | loss scale: 1.0 | grad norm: 2.493 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 12:58:57] iteration        3/    1000 | consumed samples:          192 | elapsed time per iteration (ms): 114541.0 | throughput per GPU (TFLOP/s/GPU): 67.3 | learning rate: 5.000000E-07 | global batch size:    64 | lm loss: 9.801381E-01 | loss scale: 1.0 | grad norm: 86.020 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:00:25] iteration        4/    1000 | consumed samples:          256 | elapsed time per iteration (ms): 88395.0 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 6.666667E-07 | global batch size:    64 | lm loss: 9.349928E-01 | loss scale: 1.0 | grad norm: 4.715 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 13:01:40] iteration        5/    1000 | consumed samples:          320 | elapsed time per iteration (ms): 74663.0 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 8.333333E-07 | global batch size:    64 | lm loss: 8.840026E-01 | loss scale: 1.0 | grad norm: 3.307 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-27 13:02:58] iteration        6/    1000 | consumed samples:          384 | elapsed time per iteration (ms): 78027.0 | throughput per GPU (TFLOP/s/GPU): 98.8 | learning rate: 1.000000E-06 | global batch size:    64 | lm loss: 1.074550E+00 | loss scale: 1.0 | grad norm: 8.121 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
 [2024-11-27 13:04:36] iteration        7/    1000 | consumed samples:          448 | elapsed time per iteration (ms): 98050.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.166667E-06 | global batch size:    64 | lm loss: 1.004752E+00 | loss scale: 1.0 | grad norm: 5.474 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
 [2024-11-27 13:06:20] iteration        8/    1000 | consumed samples:          512 | elapsed time per iteration (ms): 103726.0 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 1.333333E-06 | global batch size:    64 | lm loss: 9.457669E-01 | loss scale: 1.0 | grad norm: 3.092 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
 [2024-11-27 13:07:53] iteration        9/    1000 | consumed samples:          576 | elapsed time per iteration (ms): 93356.3 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 1.500000E-06 | global batch size:    64 | lm loss: 9.999593E-01 | loss scale: 1.0 | grad norm: 3.454 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 13:09:33] iteration       10/    1000 | consumed samples:          640 | elapsed time per iteration (ms): 99857.5 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 1.666667E-06 | global batch size:    64 | lm loss: 8.829347E-01 | loss scale: 1.0 | grad norm: 1.931 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 13:11:17] iteration       11/    1000 | consumed samples:          704 | elapsed time per iteration (ms): 103591.1 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 1.833333E-06 | global batch size:    64 | lm loss: 1.001601E+00 | loss scale: 1.0 | grad norm: 3.746 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:12:47] iteration       12/    1000 | consumed samples:          768 | elapsed time per iteration (ms): 90256.0 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 2.000000E-06 | global batch size:    64 | lm loss: 9.547743E-01 | loss scale: 1.0 | grad norm: 2.667 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-27 13:14:15] iteration       13/    1000 | consumed samples:          832 | elapsed time per iteration (ms): 88156.2 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 2.166667E-06 | global batch size:    64 | lm loss: 1.024508E+00 | loss scale: 1.0 | grad norm: 9.773 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 13:15:46] iteration       14/    1000 | consumed samples:          896 | elapsed time per iteration (ms): 91099.2 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 2.333333E-06 | global batch size:    64 | lm loss: 8.997112E-01 | loss scale: 1.0 | grad norm: 2.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 13:17:08] iteration       15/    1000 | consumed samples:          960 | elapsed time per iteration (ms): 82038.9 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.500000E-06 | global batch size:    64 | lm loss: 8.934980E-01 | loss scale: 1.0 | grad norm: 6.978 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 13:18:54] iteration       16/    1000 | consumed samples:         1024 | elapsed time per iteration (ms): 106026.4 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 2.666667E-06 | global batch size:    64 | lm loss: 7.564893E-01 | loss scale: 1.0 | grad norm: 1.403 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 13:20:39] iteration       17/    1000 | consumed samples:         1088 | elapsed time per iteration (ms): 104400.9 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.833333E-06 | global batch size:    64 | lm loss: 7.983471E-01 | loss scale: 1.0 | grad norm: 1.099 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:22:03] iteration       18/    1000 | consumed samples:         1152 | elapsed time per iteration (ms): 84307.6 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 3.000000E-06 | global batch size:    64 | lm loss: 8.053264E-01 | loss scale: 1.0 | grad norm: 1.238 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 13:24:02] iteration       19/    1000 | consumed samples:         1216 | elapsed time per iteration (ms): 118484.8 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 3.166667E-06 | global batch size:    64 | lm loss: 7.831764E-01 | loss scale: 1.0 | grad norm: 1.214 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-27 13:25:59] iteration       20/    1000 | consumed samples:         1280 | elapsed time per iteration (ms): 117173.2 | throughput per GPU (TFLOP/s/GPU): 65.8 | learning rate: 3.333333E-06 | global batch size:    64 | lm loss: 7.992147E-01 | loss scale: 1.0 | grad norm: 1.085 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:27:36] iteration       21/    1000 | consumed samples:         1344 | elapsed time per iteration (ms): 97623.0 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 3.500000E-06 | global batch size:    64 | lm loss: 8.462799E-01 | loss scale: 1.0 | grad norm: 1.922 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:29:03] iteration       22/    1000 | consumed samples:         1408 | elapsed time per iteration (ms): 87114.5 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 3.666667E-06 | global batch size:    64 | lm loss: 7.865314E-01 | loss scale: 1.0 | grad norm: 1.322 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:30:21] iteration       23/    1000 | consumed samples:         1472 | elapsed time per iteration (ms): 77282.9 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 3.833333E-06 | global batch size:    64 | lm loss: 7.984553E-01 | loss scale: 1.0 | grad norm: 1.140 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:31:44] iteration       24/    1000 | consumed samples:         1536 | elapsed time per iteration (ms): 83399.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.000000E-06 | global batch size:    64 | lm loss: 7.098362E-01 | loss scale: 1.0 | grad norm: 1.013 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-27 13:32:54] iteration       25/    1000 | consumed samples:         1600 | elapsed time per iteration (ms): 70320.5 | throughput per GPU (TFLOP/s/GPU): 109.6 | learning rate: 4.166667E-06 | global batch size:    64 | lm loss: 7.817208E-01 | loss scale: 1.0 | grad norm: 1.044 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
 [2024-11-27 13:34:26] iteration       26/    1000 | consumed samples:         1664 | elapsed time per iteration (ms): 91962.2 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 4.333333E-06 | global batch size:    64 | lm loss: 7.768258E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-27 13:36:13] iteration       27/    1000 | consumed samples:         1728 | elapsed time per iteration (ms): 106582.5 | throughput per GPU (TFLOP/s/GPU): 72.3 | learning rate: 4.500000E-06 | global batch size:    64 | lm loss: 7.915913E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:38:00] iteration       28/    1000 | consumed samples:         1792 | elapsed time per iteration (ms): 107384.1 | throughput per GPU (TFLOP/s/GPU): 71.8 | learning rate: 4.666667E-06 | global batch size:    64 | lm loss: 8.112209E-01 | loss scale: 1.0 | grad norm: 1.072 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
 [2024-11-27 13:39:28] iteration       29/    1000 | consumed samples:         1856 | elapsed time per iteration (ms): 87109.0 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.833333E-06 | global batch size:    64 | lm loss: 7.587413E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:41:04] iteration       30/    1000 | consumed samples:         1920 | elapsed time per iteration (ms): 96509.7 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 5.000000E-06 | global batch size:    64 | lm loss: 7.194266E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:42:18] iteration       31/    1000 | consumed samples:         1984 | elapsed time per iteration (ms): 74196.3 | throughput per GPU (TFLOP/s/GPU): 103.9 | learning rate: 4.999987E-06 | global batch size:    64 | lm loss: 7.742970E-01 | loss scale: 1.0 | grad norm: 0.987 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:43:48] iteration       32/    1000 | consumed samples:         2048 | elapsed time per iteration (ms): 90021.2 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 4.999949E-06 | global batch size:    64 | lm loss: 8.077890E-01 | loss scale: 1.0 | grad norm: 1.652 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530
[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530
[h264 @ 0x555dec5a5140] Missing reference picture, default is 65530
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530
[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530
[h264 @ 0x55d9569edbc0] Missing reference picture, default is 65530
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:45:32] iteration       33/    1000 | consumed samples:         2112 | elapsed time per iteration (ms): 103808.1 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 4.999884E-06 | global batch size:    64 | lm loss: 7.570043E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:47:09] iteration       34/    1000 | consumed samples:         2176 | elapsed time per iteration (ms): 96719.9 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 4.999794E-06 | global batch size:    64 | lm loss: 7.298007E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 13:48:35] iteration       35/    1000 | consumed samples:         2240 | elapsed time per iteration (ms): 85660.6 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 4.999679E-06 | global batch size:    64 | lm loss: 7.571429E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530
[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530
[h264 @ 0x555df1cb0600] Missing reference picture, default is 65530
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530
[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530
[h264 @ 0x55d957fc08c0] Missing reference picture, default is 65530
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
 [2024-11-27 13:50:08] iteration       36/    1000 | consumed samples:         2304 | elapsed time per iteration (ms): 93714.6 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.999537E-06 | global batch size:    64 | lm loss: 7.604092E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 13:51:33] iteration       37/    1000 | consumed samples:         2368 | elapsed time per iteration (ms): 84669.5 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 4.999370E-06 | global batch size:    64 | lm loss: 7.159459E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 13:53:03] iteration       38/    1000 | consumed samples:         2432 | elapsed time per iteration (ms): 90181.2 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 4.999178E-06 | global batch size:    64 | lm loss: 7.358369E-01 | loss scale: 1.0 | grad norm: 1.060 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 13:54:22] iteration       39/    1000 | consumed samples:         2496 | elapsed time per iteration (ms): 79226.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 4.998959E-06 | global batch size:    64 | lm loss: 7.418987E-01 | loss scale: 1.0 | grad norm: 0.799 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 13:55:49] iteration       40/    1000 | consumed samples:         2560 | elapsed time per iteration (ms): 86430.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 4.998715E-06 | global batch size:    64 | lm loss: 7.414553E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 13:57:24] iteration       41/    1000 | consumed samples:         2624 | elapsed time per iteration (ms): 95156.8 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.998445E-06 | global batch size:    64 | lm loss: 7.269660E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-27 13:58:48] iteration       42/    1000 | consumed samples:         2688 | elapsed time per iteration (ms): 84230.2 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.998150E-06 | global batch size:    64 | lm loss: 6.718149E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
 [2024-11-27 14:00:19] iteration       43/    1000 | consumed samples:         2752 | elapsed time per iteration (ms): 91001.1 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.997829E-06 | global batch size:    64 | lm loss: 8.040012E-01 | loss scale: 1.0 | grad norm: 1.645 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 14:01:43] iteration       44/    1000 | consumed samples:         2816 | elapsed time per iteration (ms): 83601.6 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 4.997482E-06 | global batch size:    64 | lm loss: 7.258285E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
 [2024-11-27 14:02:59] iteration       45/    1000 | consumed samples:         2880 | elapsed time per iteration (ms): 75874.0 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 4.997109E-06 | global batch size:    64 | lm loss: 7.296727E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 14:04:26] iteration       46/    1000 | consumed samples:         2944 | elapsed time per iteration (ms): 87483.0 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.996711E-06 | global batch size:    64 | lm loss: 7.469407E-01 | loss scale: 1.0 | grad norm: 0.989 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
 [2024-11-27 14:05:36] iteration       47/    1000 | consumed samples:         3008 | elapsed time per iteration (ms): 69902.1 | throughput per GPU (TFLOP/s/GPU): 110.3 | learning rate: 4.996287E-06 | global batch size:    64 | lm loss: 6.770863E-01 | loss scale: 1.0 | grad norm: 1.086 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
 [2024-11-27 14:06:49] iteration       48/    1000 | consumed samples:         3072 | elapsed time per iteration (ms): 73109.8 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.995838E-06 | global batch size:    64 | lm loss: 7.589791E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 14:08:27] iteration       49/    1000 | consumed samples:         3136 | elapsed time per iteration (ms): 97771.8 | throughput per GPU (TFLOP/s/GPU): 78.8 | learning rate: 4.995363E-06 | global batch size:    64 | lm loss: 7.114014E-01 | loss scale: 1.0 | grad norm: 0.783 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
 [2024-11-27 14:10:59] iteration       50/    1000 | consumed samples:         3200 | elapsed time per iteration (ms): 151487.1 | throughput per GPU (TFLOP/s/GPU): 50.9 | learning rate: 4.994862E-06 | global batch size:    64 | lm loss: 6.831369E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 14:12:58] iteration       51/    1000 | consumed samples:         3264 | elapsed time per iteration (ms): 119229.3 | throughput per GPU (TFLOP/s/GPU): 64.7 | learning rate: 4.994335E-06 | global batch size:    64 | lm loss: 6.711353E-01 | loss scale: 1.0 | grad norm: 0.795 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-27 14:14:21] iteration       52/    1000 | consumed samples:         3328 | elapsed time per iteration (ms): 82743.2 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 4.993783E-06 | global batch size:    64 | lm loss: 6.961546E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 14:15:36] iteration       53/    1000 | consumed samples:         3392 | elapsed time per iteration (ms): 75412.8 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 4.993206E-06 | global batch size:    64 | lm loss: 7.831711E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-27 14:17:03] iteration       54/    1000 | consumed samples:         3456 | elapsed time per iteration (ms): 86695.8 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 4.992602E-06 | global batch size:    64 | lm loss: 7.702816E-01 | loss scale: 1.0 | grad norm: 1.704 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 14:18:34] iteration       55/    1000 | consumed samples:         3520 | elapsed time per iteration (ms): 90982.9 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.991973E-06 | global batch size:    64 | lm loss: 7.983244E-01 | loss scale: 1.0 | grad norm: 0.989 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 14:20:14] iteration       56/    1000 | consumed samples:         3584 | elapsed time per iteration (ms): 100724.3 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.991319E-06 | global batch size:    64 | lm loss: 6.829706E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
 [2024-11-27 14:21:37] iteration       57/    1000 | consumed samples:         3648 | elapsed time per iteration (ms): 82850.4 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 4.990639E-06 | global batch size:    64 | lm loss: 6.990687E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 14:23:03] iteration       58/    1000 | consumed samples:         3712 | elapsed time per iteration (ms): 85987.9 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.989933E-06 | global batch size:    64 | lm loss: 7.449348E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
 [2024-11-27 14:24:15] iteration       59/    1000 | consumed samples:         3776 | elapsed time per iteration (ms): 71954.2 | throughput per GPU (TFLOP/s/GPU): 107.1 | learning rate: 4.989201E-06 | global batch size:    64 | lm loss: 7.466602E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-27 14:25:53] iteration       60/    1000 | consumed samples:         3840 | elapsed time per iteration (ms): 97724.8 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.988444E-06 | global batch size:    64 | lm loss: 7.879194E-01 | loss scale: 1.0 | grad norm: 0.734 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 14:27:15] iteration       61/    1000 | consumed samples:         3904 | elapsed time per iteration (ms): 82152.3 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 4.987662E-06 | global batch size:    64 | lm loss: 7.262596E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-27 14:28:32] iteration       62/    1000 | consumed samples:         3968 | elapsed time per iteration (ms): 76831.2 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 4.986854E-06 | global batch size:    64 | lm loss: 7.915837E-01 | loss scale: 1.0 | grad norm: 0.811 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-27 14:30:03] iteration       63/    1000 | consumed samples:         4032 | elapsed time per iteration (ms): 91242.8 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 4.986020E-06 | global batch size:    64 | lm loss: 9.017408E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 14:31:13] iteration       64/    1000 | consumed samples:         4096 | elapsed time per iteration (ms): 70065.0 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 4.985161E-06 | global batch size:    64 | lm loss: 7.884458E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 14:33:07] iteration       65/    1000 | consumed samples:         4160 | elapsed time per iteration (ms): 113360.7 | throughput per GPU (TFLOP/s/GPU): 68.0 | learning rate: 4.984276E-06 | global batch size:    64 | lm loss: 7.222143E-01 | loss scale: 1.0 | grad norm: 0.766 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 14:34:30] iteration       66/    1000 | consumed samples:         4224 | elapsed time per iteration (ms): 83724.4 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.983366E-06 | global batch size:    64 | lm loss: 7.112570E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
 [2024-11-27 14:35:55] iteration       67/    1000 | consumed samples:         4288 | elapsed time per iteration (ms): 84730.4 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 4.982430E-06 | global batch size:    64 | lm loss: 7.537538E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 14:37:12] iteration       68/    1000 | consumed samples:         4352 | elapsed time per iteration (ms): 76790.5 | throughput per GPU (TFLOP/s/GPU): 100.4 | learning rate: 4.981468E-06 | global batch size:    64 | lm loss: 6.872675E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 14:38:48] iteration       69/    1000 | consumed samples:         4416 | elapsed time per iteration (ms): 95911.3 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 4.980482E-06 | global batch size:    64 | lm loss: 7.328756E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-27 14:41:37] iteration       70/    1000 | consumed samples:         4480 | elapsed time per iteration (ms): 169115.5 | throughput per GPU (TFLOP/s/GPU): 45.6 | learning rate: 4.979469E-06 | global batch size:    64 | lm loss: 7.115982E-01 | loss scale: 1.0 | grad norm: 0.753 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-27 14:43:00] iteration       71/    1000 | consumed samples:         4544 | elapsed time per iteration (ms): 82520.7 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.978431E-06 | global batch size:    64 | lm loss: 7.005649E-01 | loss scale: 1.0 | grad norm: 0.698 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 14:44:33] iteration       72/    1000 | consumed samples:         4608 | elapsed time per iteration (ms): 93772.7 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 4.977368E-06 | global batch size:    64 | lm loss: 8.310930E-01 | loss scale: 1.0 | grad norm: 0.999 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 14:46:24] iteration       73/    1000 | consumed samples:         4672 | elapsed time per iteration (ms): 111174.0 | throughput per GPU (TFLOP/s/GPU): 69.3 | learning rate: 4.976279E-06 | global batch size:    64 | lm loss: 7.123011E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 14:47:50] iteration       74/    1000 | consumed samples:         4736 | elapsed time per iteration (ms): 85140.8 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 4.975165E-06 | global batch size:    64 | lm loss: 8.185971E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
 [2024-11-27 14:49:33] iteration       75/    1000 | consumed samples:         4800 | elapsed time per iteration (ms): 103443.5 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 4.974025E-06 | global batch size:    64 | lm loss: 6.794741E-01 | loss scale: 1.0 | grad norm: 0.943 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-27 14:51:01] iteration       76/    1000 | consumed samples:         4864 | elapsed time per iteration (ms): 88112.8 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 4.972860E-06 | global batch size:    64 | lm loss: 6.982243E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
 [2024-11-27 14:52:16] iteration       77/    1000 | consumed samples:         4928 | elapsed time per iteration (ms): 75137.7 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 4.971670E-06 | global batch size:    64 | lm loss: 7.333058E-01 | loss scale: 1.0 | grad norm: 1.006 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 14:53:31] iteration       78/    1000 | consumed samples:         4992 | elapsed time per iteration (ms): 74671.8 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.970454E-06 | global batch size:    64 | lm loss: 6.710973E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 14:54:51] iteration       79/    1000 | consumed samples:         5056 | elapsed time per iteration (ms): 80178.4 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.969213E-06 | global batch size:    64 | lm loss: 6.840650E-01 | loss scale: 1.0 | grad norm: 1.106 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 14:56:19] iteration       80/    1000 | consumed samples:         5120 | elapsed time per iteration (ms): 87527.5 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.967946E-06 | global batch size:    64 | lm loss: 7.118856E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
 [2024-11-27 14:57:29] iteration       81/    1000 | consumed samples:         5184 | elapsed time per iteration (ms): 70098.8 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 4.966654E-06 | global batch size:    64 | lm loss: 8.833607E-01 | loss scale: 1.0 | grad norm: 1.238 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 14:58:50] iteration       82/    1000 | consumed samples:         5248 | elapsed time per iteration (ms): 81244.6 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 4.965337E-06 | global batch size:    64 | lm loss: 6.820184E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 15:00:24] iteration       83/    1000 | consumed samples:         5312 | elapsed time per iteration (ms): 93369.2 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 4.963994E-06 | global batch size:    64 | lm loss: 7.110070E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 15:01:46] iteration       84/    1000 | consumed samples:         5376 | elapsed time per iteration (ms): 81995.1 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 4.962626E-06 | global batch size:    64 | lm loss: 7.219861E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
 [2024-11-27 15:03:32] iteration       85/    1000 | consumed samples:         5440 | elapsed time per iteration (ms): 106733.3 | throughput per GPU (TFLOP/s/GPU): 72.2 | learning rate: 4.961232E-06 | global batch size:    64 | lm loss: 6.973246E-01 | loss scale: 1.0 | grad norm: 0.815 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 15:04:48] iteration       86/    1000 | consumed samples:         5504 | elapsed time per iteration (ms): 75544.6 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 4.959814E-06 | global batch size:    64 | lm loss: 6.851219E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
 [2024-11-27 15:06:04] iteration       87/    1000 | consumed samples:         5568 | elapsed time per iteration (ms): 76222.9 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 4.958370E-06 | global batch size:    64 | lm loss: 8.057975E-01 | loss scale: 1.0 | grad norm: 1.777 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 15:07:34] iteration       88/    1000 | consumed samples:         5632 | elapsed time per iteration (ms): 90322.7 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 4.956901E-06 | global batch size:    64 | lm loss: 7.231341E-01 | loss scale: 1.0 | grad norm: 1.386 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:08:55] iteration       89/    1000 | consumed samples:         5696 | elapsed time per iteration (ms): 80229.7 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.955406E-06 | global batch size:    64 | lm loss: 7.316175E-01 | loss scale: 1.0 | grad norm: 1.309 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-27 15:10:08] iteration       90/    1000 | consumed samples:         5760 | elapsed time per iteration (ms): 73185.2 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 4.953887E-06 | global batch size:    64 | lm loss: 6.497885E-01 | loss scale: 1.0 | grad norm: 0.729 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:11:13] iteration       91/    1000 | consumed samples:         5824 | elapsed time per iteration (ms): 64640.5 | throughput per GPU (TFLOP/s/GPU): 119.3 | learning rate: 4.952342E-06 | global batch size:    64 | lm loss: 6.669019E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-27 15:12:37] iteration       92/    1000 | consumed samples:         5888 | elapsed time per iteration (ms): 84408.5 | throughput per GPU (TFLOP/s/GPU): 91.3 | learning rate: 4.950772E-06 | global batch size:    64 | lm loss: 7.155092E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-27 15:13:55] iteration       93/    1000 | consumed samples:         5952 | elapsed time per iteration (ms): 78195.1 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 4.949176E-06 | global batch size:    64 | lm loss: 6.895306E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 15:15:17] iteration       94/    1000 | consumed samples:         6016 | elapsed time per iteration (ms): 82313.0 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 4.947556E-06 | global batch size:    64 | lm loss: 7.441191E-01 | loss scale: 1.0 | grad norm: 33.727 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 15:16:42] iteration       95/    1000 | consumed samples:         6080 | elapsed time per iteration (ms): 84126.4 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 4.945910E-06 | global batch size:    64 | lm loss: 7.333177E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-27 15:18:12] iteration       96/    1000 | consumed samples:         6144 | elapsed time per iteration (ms): 90870.2 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 4.944240E-06 | global batch size:    64 | lm loss: 7.676827E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 15:19:41] iteration       97/    1000 | consumed samples:         6208 | elapsed time per iteration (ms): 88465.1 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.942544E-06 | global batch size:    64 | lm loss: 7.284559E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 15:21:07] iteration       98/    1000 | consumed samples:         6272 | elapsed time per iteration (ms): 85811.5 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.940823E-06 | global batch size:    64 | lm loss: 6.762350E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:22:32] iteration       99/    1000 | consumed samples:         6336 | elapsed time per iteration (ms): 85430.6 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 4.939077E-06 | global batch size:    64 | lm loss: 7.216333E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555ded4ed500] mmco: unref short failure
[h264 @ 0x555ded4ed500] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 15:24:10] iteration      100/    1000 | consumed samples:         6400 | elapsed time per iteration (ms): 97707.6 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.937306E-06 | global batch size:    64 | lm loss: 7.216407E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (204044.53, 204044.86)
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 15:29:05] iteration      101/    1000 | consumed samples:         6464 | elapsed time per iteration (ms): 91260.4 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 4.935510E-06 | global batch size:    64 | lm loss: 6.965908E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-27 15:31:09] iteration      102/    1000 | consumed samples:         6528 | elapsed time per iteration (ms): 123298.8 | throughput per GPU (TFLOP/s/GPU): 62.5 | learning rate: 4.933689E-06 | global batch size:    64 | lm loss: 7.452544E-01 | loss scale: 1.0 | grad norm: 1.219 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 15:32:34] iteration      103/    1000 | consumed samples:         6592 | elapsed time per iteration (ms): 85714.0 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.931842E-06 | global batch size:    64 | lm loss: 6.922635E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 15:34:00] iteration      104/    1000 | consumed samples:         6656 | elapsed time per iteration (ms): 85261.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.929971E-06 | global batch size:    64 | lm loss: 6.881779E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 15:35:39] iteration      105/    1000 | consumed samples:         6720 | elapsed time per iteration (ms): 99837.9 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 4.928075E-06 | global batch size:    64 | lm loss: 7.913043E-01 | loss scale: 1.0 | grad norm: 16.914 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-27 15:37:17] iteration      106/    1000 | consumed samples:         6784 | elapsed time per iteration (ms): 97721.0 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 4.926154E-06 | global batch size:    64 | lm loss: 7.194221E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-27 15:38:50] iteration      107/    1000 | consumed samples:         6848 | elapsed time per iteration (ms): 92398.7 | throughput per GPU (TFLOP/s/GPU): 83.4 | learning rate: 4.924208E-06 | global batch size:    64 | lm loss: 7.140918E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 15:40:03] iteration      108/    1000 | consumed samples:         6912 | elapsed time per iteration (ms): 73276.4 | throughput per GPU (TFLOP/s/GPU): 105.2 | learning rate: 4.922237E-06 | global batch size:    64 | lm loss: 6.261149E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-27 15:41:20] iteration      109/    1000 | consumed samples:         6976 | elapsed time per iteration (ms): 76692.5 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 4.920242E-06 | global batch size:    64 | lm loss: 6.905310E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:42:40] iteration      110/    1000 | consumed samples:         7040 | elapsed time per iteration (ms): 79999.4 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 4.918221E-06 | global batch size:    64 | lm loss: 7.688470E-01 | loss scale: 1.0 | grad norm: 238.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-27 15:43:52] iteration      111/    1000 | consumed samples:         7104 | elapsed time per iteration (ms): 72378.0 | throughput per GPU (TFLOP/s/GPU): 106.5 | learning rate: 4.916176E-06 | global batch size:    64 | lm loss: 7.495630E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 15:45:23] iteration      112/    1000 | consumed samples:         7168 | elapsed time per iteration (ms): 90592.2 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.914105E-06 | global batch size:    64 | lm loss: 6.877882E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:46:40] iteration      113/    1000 | consumed samples:         7232 | elapsed time per iteration (ms): 77005.3 | throughput per GPU (TFLOP/s/GPU): 100.1 | learning rate: 4.912010E-06 | global batch size:    64 | lm loss: 6.341010E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:48:20] iteration      114/    1000 | consumed samples:         7296 | elapsed time per iteration (ms): 100122.3 | throughput per GPU (TFLOP/s/GPU): 77.0 | learning rate: 4.909890E-06 | global batch size:    64 | lm loss: 6.689736E-01 | loss scale: 1.0 | grad norm: 0.783 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 15:49:36] iteration      115/    1000 | consumed samples:         7360 | elapsed time per iteration (ms): 76015.8 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.907746E-06 | global batch size:    64 | lm loss: 7.142720E-01 | loss scale: 1.0 | grad norm: 2.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 15:51:09] iteration      116/    1000 | consumed samples:         7424 | elapsed time per iteration (ms): 93186.2 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 4.905577E-06 | global batch size:    64 | lm loss: 7.067767E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
 [2024-11-27 15:52:52] iteration      117/    1000 | consumed samples:         7488 | elapsed time per iteration (ms): 102965.8 | throughput per GPU (TFLOP/s/GPU): 74.9 | learning rate: 4.903383E-06 | global batch size:    64 | lm loss: 7.815012E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 15:54:06] iteration      118/    1000 | consumed samples:         7552 | elapsed time per iteration (ms): 73738.2 | throughput per GPU (TFLOP/s/GPU): 104.5 | learning rate: 4.901164E-06 | global batch size:    64 | lm loss: 7.462224E-01 | loss scale: 1.0 | grad norm: 1.016 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-27 15:55:31] iteration      119/    1000 | consumed samples:         7616 | elapsed time per iteration (ms): 85789.3 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.898920E-06 | global batch size:    64 | lm loss: 6.693496E-01 | loss scale: 1.0 | grad norm: 0.775 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 15:56:58] iteration      120/    1000 | consumed samples:         7680 | elapsed time per iteration (ms): 86049.7 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.896652E-06 | global batch size:    64 | lm loss: 6.334546E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
 [2024-11-27 16:00:27] iteration      121/    1000 | consumed samples:         7744 | elapsed time per iteration (ms): 209154.2 | throughput per GPU (TFLOP/s/GPU): 36.9 | learning rate: 4.894360E-06 | global batch size:    64 | lm loss: 6.660936E-01 | loss scale: 1.0 | grad norm: 1.144 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 16:02:16] iteration      122/    1000 | consumed samples:         7808 | elapsed time per iteration (ms): 109483.9 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 4.892043E-06 | global batch size:    64 | lm loss: 6.861758E-01 | loss scale: 1.0 | grad norm: 0.732 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 16:03:48] iteration      123/    1000 | consumed samples:         7872 | elapsed time per iteration (ms): 92055.5 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 4.889701E-06 | global batch size:    64 | lm loss: 7.023684E-01 | loss scale: 1.0 | grad norm: 27.807 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 16:05:44] iteration      124/    1000 | consumed samples:         7936 | elapsed time per iteration (ms): 115893.3 | throughput per GPU (TFLOP/s/GPU): 66.5 | learning rate: 4.887334E-06 | global batch size:    64 | lm loss: 7.402400E-01 | loss scale: 1.0 | grad norm: 1.045 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-27 16:07:07] iteration      125/    1000 | consumed samples:         8000 | elapsed time per iteration (ms): 83072.7 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.884944E-06 | global batch size:    64 | lm loss: 7.056447E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:08:45] iteration      126/    1000 | consumed samples:         8064 | elapsed time per iteration (ms): 97614.9 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 4.882528E-06 | global batch size:    64 | lm loss: 7.042141E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955acf300] [h264 @ 0x555dedfc6580] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x55d955acf300] [h264 @ 0x555dedfc6580] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 16:10:15] iteration      127/    1000 | consumed samples:         8128 | elapsed time per iteration (ms): 89749.7 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 4.880088E-06 | global batch size:    64 | lm loss: 7.120880E-01 | loss scale: 1.0 | grad norm: 0.859 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
 [2024-11-27 16:11:50] iteration      128/    1000 | consumed samples:         8192 | elapsed time per iteration (ms): 95187.7 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.877624E-06 | global batch size:    64 | lm loss: 6.666554E-01 | loss scale: 1.0 | grad norm: 1.103 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:13:19] iteration      129/    1000 | consumed samples:         8256 | elapsed time per iteration (ms): 89302.8 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.875136E-06 | global batch size:    64 | lm loss: 6.939200E-01 | loss scale: 1.0 | grad norm: 0.925 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95cbcf940] mmco: unref short failure
[h264 @ 0x55d95cbcf940] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 16:14:34] iteration      130/    1000 | consumed samples:         8320 | elapsed time per iteration (ms): 75267.7 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 4.872622E-06 | global batch size:    64 | lm loss: 6.710307E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 16:16:04] iteration      131/    1000 | consumed samples:         8384 | elapsed time per iteration (ms): 89805.7 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 4.870085E-06 | global batch size:    64 | lm loss: 6.678811E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-27 16:18:09] iteration      132/    1000 | consumed samples:         8448 | elapsed time per iteration (ms): 124949.2 | throughput per GPU (TFLOP/s/GPU): 61.7 | learning rate: 4.867523E-06 | global batch size:    64 | lm loss: 7.045669E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 16:19:25] iteration      133/    1000 | consumed samples:         8512 | elapsed time per iteration (ms): 75527.8 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 4.864937E-06 | global batch size:    64 | lm loss: 6.984726E-01 | loss scale: 1.0 | grad norm: 1.503 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
 [2024-11-27 16:21:20] iteration      134/    1000 | consumed samples:         8576 | elapsed time per iteration (ms): 115224.9 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 4.862327E-06 | global batch size:    64 | lm loss: 7.181825E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:23:06] iteration      135/    1000 | consumed samples:         8640 | elapsed time per iteration (ms): 106079.9 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 4.859692E-06 | global batch size:    64 | lm loss: 7.264897E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 16:24:21] iteration      136/    1000 | consumed samples:         8704 | elapsed time per iteration (ms): 75408.5 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 4.857033E-06 | global batch size:    64 | lm loss: 6.576435E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 500 unjoint_samples 500 joint_samples 35 [105962, 100832]
processed_samples 500 unjoint_samples 500 joint_samples 35 [105962, 100832]
processed_samples 500 unjoint_samples 500 joint_samples 33 [20193, 118380]
processed_samples 500 unjoint_samples 500 joint_samples 33 [20193, 118380]
processed_samples 500 unjoint_samples 500 joint_samples 29 [106623, 110615]
processed_samples 500 unjoint_samples 500 joint_samples 29 [106623, 110615]
processed_samples 500 unjoint_samples 500 joint_samples 39 [92083, 124515]
processed_samples 500 unjoint_samples 500 joint_samples 39 [92083, 124515]
processed_samples 500 unjoint_samples 500 joint_samples 31 [119429, 119622]
processed_samples 500 unjoint_samples 500 joint_samples 31 [119429, 119622]
processed_samples 500 unjoint_samples 500 joint_samples 34 [121945, 117011]
processed_samples 500 unjoint_samples 500 joint_samples 34 [121945, 117011]
processed_samples 500 unjoint_samples 500 joint_samples 33 [106663, 103375]
processed_samples 500 unjoint_samples 500 joint_samples 33 [106663, 103375]
processed_samples 500 unjoint_samples 500 joint_samples 38 [63985, 122137]
processed_samples 500 unjoint_samples 500 joint_samples 38 [63985, 122137]
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 16:25:45] iteration      137/    1000 | consumed samples:         8768 | elapsed time per iteration (ms): 83158.4 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 4.854350E-06 | global batch size:    64 | lm loss: 7.173734E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 16:27:18] iteration      138/    1000 | consumed samples:         8832 | elapsed time per iteration (ms): 92897.0 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 4.851643E-06 | global batch size:    64 | lm loss: 6.300446E-01 | loss scale: 1.0 | grad norm: 2.823 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 16:28:34] iteration      139/    1000 | consumed samples:         8896 | elapsed time per iteration (ms): 76658.4 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 4.848912E-06 | global batch size:    64 | lm loss: 6.692375E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 16:30:17] iteration      140/    1000 | consumed samples:         8960 | elapsed time per iteration (ms): 102415.8 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.846156E-06 | global batch size:    64 | lm loss: 6.938949E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 16:31:41] iteration      141/    1000 | consumed samples:         9024 | elapsed time per iteration (ms): 84222.7 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.843377E-06 | global batch size:    64 | lm loss: 7.084374E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
 [2024-11-27 16:33:20] iteration      142/    1000 | consumed samples:         9088 | elapsed time per iteration (ms): 99273.9 | throughput per GPU (TFLOP/s/GPU): 77.6 | learning rate: 4.840573E-06 | global batch size:    64 | lm loss: 6.900647E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 16:35:40] iteration      143/    1000 | consumed samples:         9152 | elapsed time per iteration (ms): 139672.6 | throughput per GPU (TFLOP/s/GPU): 55.2 | learning rate: 4.837746E-06 | global batch size:    64 | lm loss: 7.882947E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:37:20] iteration      144/    1000 | consumed samples:         9216 | elapsed time per iteration (ms): 99927.0 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 4.834894E-06 | global batch size:    64 | lm loss: 6.919404E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-27 16:38:31] iteration      145/    1000 | consumed samples:         9280 | elapsed time per iteration (ms): 71577.1 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 4.832018E-06 | global batch size:    64 | lm loss: 7.083249E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:39:49] iteration      146/    1000 | consumed samples:         9344 | elapsed time per iteration (ms): 77590.9 | throughput per GPU (TFLOP/s/GPU): 99.3 | learning rate: 4.829119E-06 | global batch size:    64 | lm loss: 6.647974E-01 | loss scale: 1.0 | grad norm: 0.815 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 16:41:27] iteration      147/    1000 | consumed samples:         9408 | elapsed time per iteration (ms): 97953.2 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 4.826195E-06 | global batch size:    64 | lm loss: 6.562359E-01 | loss scale: 1.0 | grad norm: 1.320 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-27 16:42:54] iteration      148/    1000 | consumed samples:         9472 | elapsed time per iteration (ms): 87479.2 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.823248E-06 | global batch size:    64 | lm loss: 6.847668E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-27 16:44:16] iteration      149/    1000 | consumed samples:         9536 | elapsed time per iteration (ms): 81834.8 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.820277E-06 | global batch size:    64 | lm loss: 6.775054E-01 | loss scale: 1.0 | grad norm: 3.057 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9568d0900] [h264 @ 0x555defd60880] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 16:45:27] iteration      150/    1000 | consumed samples:         9600 | elapsed time per iteration (ms): 70827.7 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 4.817282E-06 | global batch size:    64 | lm loss: 7.231870E-01 | loss scale: 1.0 | grad norm: 0.991 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-27 16:47:26] iteration      151/    1000 | consumed samples:         9664 | elapsed time per iteration (ms): 119267.8 | throughput per GPU (TFLOP/s/GPU): 64.6 | learning rate: 4.814263E-06 | global batch size:    64 | lm loss: 6.249514E-01 | loss scale: 1.0 | grad norm: 1.199 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 16:48:42] iteration      152/    1000 | consumed samples:         9728 | elapsed time per iteration (ms): 75657.6 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.811221E-06 | global batch size:    64 | lm loss: 7.830349E-01 | loss scale: 1.0 | grad norm: 3.815 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:50:09] iteration      153/    1000 | consumed samples:         9792 | elapsed time per iteration (ms): 86625.1 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 4.808155E-06 | global batch size:    64 | lm loss: 6.208155E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 16:51:29] iteration      154/    1000 | consumed samples:         9856 | elapsed time per iteration (ms): 79964.0 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 4.805065E-06 | global batch size:    64 | lm loss: 7.659938E-01 | loss scale: 1.0 | grad norm: 0.992 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
 [2024-11-27 16:53:04] iteration      155/    1000 | consumed samples:         9920 | elapsed time per iteration (ms): 95631.4 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 4.801951E-06 | global batch size:    64 | lm loss: 6.927797E-01 | loss scale: 1.0 | grad norm: 0.983 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 16:54:17] iteration      156/    1000 | consumed samples:         9984 | elapsed time per iteration (ms): 72721.3 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 4.798814E-06 | global batch size:    64 | lm loss: 7.079293E-01 | loss scale: 1.0 | grad norm: 8.416 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-27 16:55:37] iteration      157/    1000 | consumed samples:        10048 | elapsed time per iteration (ms): 79895.5 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 4.795653E-06 | global batch size:    64 | lm loss: 6.731150E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-27 16:57:34] iteration      158/    1000 | consumed samples:        10112 | elapsed time per iteration (ms): 116763.1 | throughput per GPU (TFLOP/s/GPU): 66.0 | learning rate: 4.792469E-06 | global batch size:    64 | lm loss: 7.790256E-01 | loss scale: 1.0 | grad norm: 0.822 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
 [2024-11-27 16:59:25] iteration      159/    1000 | consumed samples:        10176 | elapsed time per iteration (ms): 111649.0 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 4.789261E-06 | global batch size:    64 | lm loss: 7.043136E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
 [2024-11-27 17:01:01] iteration      160/    1000 | consumed samples:        10240 | elapsed time per iteration (ms): 96076.4 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 4.786030E-06 | global batch size:    64 | lm loss: 6.699425E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-27 17:02:47] iteration      161/    1000 | consumed samples:        10304 | elapsed time per iteration (ms): 105622.4 | throughput per GPU (TFLOP/s/GPU): 73.0 | learning rate: 4.782775E-06 | global batch size:    64 | lm loss: 6.520627E-01 | loss scale: 1.0 | grad norm: 1.032 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
 [2024-11-27 17:04:19] iteration      162/    1000 | consumed samples:        10368 | elapsed time per iteration (ms): 91872.5 | throughput per GPU (TFLOP/s/GPU): 83.9 | learning rate: 4.779497E-06 | global batch size:    64 | lm loss: 7.085309E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 17:05:43] iteration      163/    1000 | consumed samples:        10432 | elapsed time per iteration (ms): 83853.9 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 4.776195E-06 | global batch size:    64 | lm loss: 7.403272E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:07:09] iteration      164/    1000 | consumed samples:        10496 | elapsed time per iteration (ms): 85988.2 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 4.772870E-06 | global batch size:    64 | lm loss: 7.570689E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 17:09:03] iteration      165/    1000 | consumed samples:        10560 | elapsed time per iteration (ms): 114128.6 | throughput per GPU (TFLOP/s/GPU): 67.5 | learning rate: 4.769522E-06 | global batch size:    64 | lm loss: 6.929017E-01 | loss scale: 1.0 | grad norm: 1.047 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 17:10:26] iteration      166/    1000 | consumed samples:        10624 | elapsed time per iteration (ms): 82536.4 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.766150E-06 | global batch size:    64 | lm loss: 6.568977E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-27 17:11:55] iteration      167/    1000 | consumed samples:        10688 | elapsed time per iteration (ms): 89932.8 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 4.762755E-06 | global batch size:    64 | lm loss: 7.344331E-01 | loss scale: 1.0 | grad norm: 1.168 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:13:22] iteration      168/    1000 | consumed samples:        10752 | elapsed time per iteration (ms): 86819.8 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 4.759337E-06 | global batch size:    64 | lm loss: 7.115574E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 17:14:44] iteration      169/    1000 | consumed samples:        10816 | elapsed time per iteration (ms): 81933.9 | throughput per GPU (TFLOP/s/GPU): 94.1 | learning rate: 4.755896E-06 | global batch size:    64 | lm loss: 7.456884E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:16:22] iteration      170/    1000 | consumed samples:        10880 | elapsed time per iteration (ms): 97941.6 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 4.752432E-06 | global batch size:    64 | lm loss: 7.081493E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 17:19:09] iteration      171/    1000 | consumed samples:        10944 | elapsed time per iteration (ms): 166384.3 | throughput per GPU (TFLOP/s/GPU): 46.3 | learning rate: 4.748944E-06 | global batch size:    64 | lm loss: 6.956521E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-27 17:20:29] iteration      172/    1000 | consumed samples:        11008 | elapsed time per iteration (ms): 80851.7 | throughput per GPU (TFLOP/s/GPU): 95.3 | learning rate: 4.745434E-06 | global batch size:    64 | lm loss: 7.245098E-01 | loss scale: 1.0 | grad norm: 1.193 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-27 17:21:57] iteration      173/    1000 | consumed samples:        11072 | elapsed time per iteration (ms): 87496.9 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.741900E-06 | global batch size:    64 | lm loss: 7.148576E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
 [2024-11-27 17:23:19] iteration      174/    1000 | consumed samples:        11136 | elapsed time per iteration (ms): 81798.7 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.738344E-06 | global batch size:    64 | lm loss: 7.298409E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 17:25:03] iteration      175/    1000 | consumed samples:        11200 | elapsed time per iteration (ms): 104541.5 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 4.734764E-06 | global batch size:    64 | lm loss: 7.137690E-01 | loss scale: 1.0 | grad norm: 0.988 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:26:48] iteration      176/    1000 | consumed samples:        11264 | elapsed time per iteration (ms): 104206.7 | throughput per GPU (TFLOP/s/GPU): 74.0 | learning rate: 4.731162E-06 | global batch size:    64 | lm loss: 6.374616E-01 | loss scale: 1.0 | grad norm: 0.791 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:28:10] iteration      177/    1000 | consumed samples:        11328 | elapsed time per iteration (ms): 82484.2 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.727537E-06 | global batch size:    64 | lm loss: 6.290931E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-27 17:29:48] iteration      178/    1000 | consumed samples:        11392 | elapsed time per iteration (ms): 98160.7 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 4.723889E-06 | global batch size:    64 | lm loss: 6.425977E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
 [2024-11-27 17:31:09] iteration      179/    1000 | consumed samples:        11456 | elapsed time per iteration (ms): 80280.2 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.720218E-06 | global batch size:    64 | lm loss: 7.326845E-01 | loss scale: 1.0 | grad norm: 1.031 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-27 17:32:37] iteration      180/    1000 | consumed samples:        11520 | elapsed time per iteration (ms): 88866.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.716524E-06 | global batch size:    64 | lm loss: 6.877599E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 17:34:02] iteration      181/    1000 | consumed samples:        11584 | elapsed time per iteration (ms): 84486.9 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 4.712808E-06 | global batch size:    64 | lm loss: 6.831203E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 17:35:22] iteration      182/    1000 | consumed samples:        11648 | elapsed time per iteration (ms): 80280.8 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.709068E-06 | global batch size:    64 | lm loss: 7.477552E-01 | loss scale: 1.0 | grad norm: 0.931 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 17:36:37] iteration      183/    1000 | consumed samples:        11712 | elapsed time per iteration (ms): 74934.2 | throughput per GPU (TFLOP/s/GPU): 102.9 | learning rate: 4.705307E-06 | global batch size:    64 | lm loss: 6.492839E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
 [2024-11-27 17:38:25] iteration      184/    1000 | consumed samples:        11776 | elapsed time per iteration (ms): 108061.1 | throughput per GPU (TFLOP/s/GPU): 71.3 | learning rate: 4.701522E-06 | global batch size:    64 | lm loss: 6.642103E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-27 17:39:58] iteration      185/    1000 | consumed samples:        11840 | elapsed time per iteration (ms): 92756.7 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 4.697715E-06 | global batch size:    64 | lm loss: 6.904918E-01 | loss scale: 1.0 | grad norm: 1.049 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 17:41:22] iteration      186/    1000 | consumed samples:        11904 | elapsed time per iteration (ms): 83547.8 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 4.693886E-06 | global batch size:    64 | lm loss: 7.127905E-01 | loss scale: 1.0 | grad norm: 0.798 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 17:43:29] iteration      187/    1000 | consumed samples:        11968 | elapsed time per iteration (ms): 127045.6 | throughput per GPU (TFLOP/s/GPU): 60.7 | learning rate: 4.690034E-06 | global batch size:    64 | lm loss: 7.377602E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 17:44:43] iteration      188/    1000 | consumed samples:        12032 | elapsed time per iteration (ms): 74238.4 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 4.686160E-06 | global batch size:    64 | lm loss: 6.513256E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
 [2024-11-27 17:46:07] iteration      189/    1000 | consumed samples:        12096 | elapsed time per iteration (ms): 84273.9 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.682263E-06 | global batch size:    64 | lm loss: 6.903006E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
 [2024-11-27 17:47:21] iteration      190/    1000 | consumed samples:        12160 | elapsed time per iteration (ms): 74023.6 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 4.678344E-06 | global batch size:    64 | lm loss: 6.182513E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-27 17:48:44] iteration      191/    1000 | consumed samples:        12224 | elapsed time per iteration (ms): 82835.6 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 4.674402E-06 | global batch size:    64 | lm loss: 7.302966E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-27 17:50:13] iteration      192/    1000 | consumed samples:        12288 | elapsed time per iteration (ms): 88892.2 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.670439E-06 | global batch size:    64 | lm loss: 7.525570E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 17:51:53] iteration      193/    1000 | consumed samples:        12352 | elapsed time per iteration (ms): 100304.3 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 4.666453E-06 | global batch size:    64 | lm loss: 7.345333E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 17:53:19] iteration      194/    1000 | consumed samples:        12416 | elapsed time per iteration (ms): 86240.2 | throughput per GPU (TFLOP/s/GPU): 89.4 | learning rate: 4.662444E-06 | global batch size:    64 | lm loss: 7.116587E-01 | loss scale: 1.0 | grad norm: 1.096 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 17:54:53] iteration      195/    1000 | consumed samples:        12480 | elapsed time per iteration (ms): 93884.9 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 4.658414E-06 | global batch size:    64 | lm loss: 7.222701E-01 | loss scale: 1.0 | grad norm: 0.918 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-27 17:56:14] iteration      196/    1000 | consumed samples:        12544 | elapsed time per iteration (ms): 80589.4 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 4.654361E-06 | global batch size:    64 | lm loss: 6.828011E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 17:57:31] iteration      197/    1000 | consumed samples:        12608 | elapsed time per iteration (ms): 76912.4 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 4.650287E-06 | global batch size:    64 | lm loss: 6.875466E-01 | loss scale: 1.0 | grad norm: 1.037 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 17:58:54] iteration      198/    1000 | consumed samples:        12672 | elapsed time per iteration (ms): 83182.9 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 4.646190E-06 | global batch size:    64 | lm loss: 7.020935E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
 [2024-11-27 18:00:11] iteration      199/    1000 | consumed samples:        12736 | elapsed time per iteration (ms): 77283.6 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 4.642072E-06 | global batch size:    64 | lm loss: 7.617804E-01 | loss scale: 1.0 | grad norm: 1.135 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-27 18:01:18] iteration      200/    1000 | consumed samples:        12800 | elapsed time per iteration (ms): 66427.1 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.637931E-06 | global batch size:    64 | lm loss: 6.670317E-01 | loss scale: 1.0 | grad norm: 1.136 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (209629.20, 209629.58)
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
 [2024-11-27 18:06:03] iteration      201/    1000 | consumed samples:        12864 | elapsed time per iteration (ms): 75198.8 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.633769E-06 | global batch size:    64 | lm loss: 7.316583E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
 [2024-11-27 18:07:42] iteration      202/    1000 | consumed samples:        12928 | elapsed time per iteration (ms): 99254.2 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 4.629585E-06 | global batch size:    64 | lm loss: 6.787452E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-27 18:09:17] iteration      203/    1000 | consumed samples:        12992 | elapsed time per iteration (ms): 94646.6 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 4.625378E-06 | global batch size:    64 | lm loss: 7.176006E-01 | loss scale: 1.0 | grad norm: 1.155 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 18:10:40] iteration      204/    1000 | consumed samples:        13056 | elapsed time per iteration (ms): 83515.3 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 4.621151E-06 | global batch size:    64 | lm loss: 7.039618E-01 | loss scale: 1.0 | grad norm: 0.840 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
 [2024-11-27 18:12:17] iteration      205/    1000 | consumed samples:        13120 | elapsed time per iteration (ms): 96650.8 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 4.616901E-06 | global batch size:    64 | lm loss: 6.275778E-01 | loss scale: 1.0 | grad norm: 1.519 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
 [2024-11-27 18:13:47] iteration      206/    1000 | consumed samples:        13184 | elapsed time per iteration (ms): 89884.3 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 4.612630E-06 | global batch size:    64 | lm loss: 6.750375E-01 | loss scale: 1.0 | grad norm: 1.124 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
 [2024-11-27 18:15:16] iteration      207/    1000 | consumed samples:        13248 | elapsed time per iteration (ms): 88931.2 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.608337E-06 | global batch size:    64 | lm loss: 7.806692E-01 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:16:56] iteration      208/    1000 | consumed samples:        13312 | elapsed time per iteration (ms): 100571.4 | throughput per GPU (TFLOP/s/GPU): 76.6 | learning rate: 4.604022E-06 | global batch size:    64 | lm loss: 6.860654E-01 | loss scale: 1.0 | grad norm: 1.029 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 18:18:18] iteration      209/    1000 | consumed samples:        13376 | elapsed time per iteration (ms): 81771.6 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 4.599686E-06 | global batch size:    64 | lm loss: 7.601146E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
 [2024-11-27 18:19:41] iteration      210/    1000 | consumed samples:        13440 | elapsed time per iteration (ms): 83384.6 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.595329E-06 | global batch size:    64 | lm loss: 7.088625E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:21:22] iteration      211/    1000 | consumed samples:        13504 | elapsed time per iteration (ms): 100503.4 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 4.590950E-06 | global batch size:    64 | lm loss: 6.751691E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 18:22:53] iteration      212/    1000 | consumed samples:        13568 | elapsed time per iteration (ms): 90839.6 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 4.586549E-06 | global batch size:    64 | lm loss: 7.898833E-01 | loss scale: 1.0 | grad norm: 1.115 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
 [2024-11-27 18:24:07] iteration      213/    1000 | consumed samples:        13632 | elapsed time per iteration (ms): 74153.9 | throughput per GPU (TFLOP/s/GPU): 104.0 | learning rate: 4.582128E-06 | global batch size:    64 | lm loss: 6.275224E-01 | loss scale: 1.0 | grad norm: 2.603 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 18:25:25] iteration      214/    1000 | consumed samples:        13696 | elapsed time per iteration (ms): 77891.6 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.577684E-06 | global batch size:    64 | lm loss: 7.842699E-01 | loss scale: 1.0 | grad norm: 1.162 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
 [2024-11-27 18:26:54] iteration      215/    1000 | consumed samples:        13760 | elapsed time per iteration (ms): 89511.7 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 4.573220E-06 | global batch size:    64 | lm loss: 7.736768E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 18:28:37] iteration      216/    1000 | consumed samples:        13824 | elapsed time per iteration (ms): 102352.8 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.568735E-06 | global batch size:    64 | lm loss: 6.585745E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:30:21] iteration      217/    1000 | consumed samples:        13888 | elapsed time per iteration (ms): 103961.8 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 4.564228E-06 | global batch size:    64 | lm loss: 6.783351E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-27 18:31:50] iteration      218/    1000 | consumed samples:        13952 | elapsed time per iteration (ms): 89126.9 | throughput per GPU (TFLOP/s/GPU): 86.5 | learning rate: 4.559700E-06 | global batch size:    64 | lm loss: 6.602423E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 18:33:10] iteration      219/    1000 | consumed samples:        14016 | elapsed time per iteration (ms): 80601.5 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 4.555151E-06 | global batch size:    64 | lm loss: 7.091999E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:34:33] iteration      220/    1000 | consumed samples:        14080 | elapsed time per iteration (ms): 82413.8 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.550581E-06 | global batch size:    64 | lm loss: 6.900014E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
 [2024-11-27 18:35:58] iteration      221/    1000 | consumed samples:        14144 | elapsed time per iteration (ms): 84945.4 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 4.545990E-06 | global batch size:    64 | lm loss: 7.463014E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-27 18:37:37] iteration      222/    1000 | consumed samples:        14208 | elapsed time per iteration (ms): 99477.3 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 4.541378E-06 | global batch size:    64 | lm loss: 7.625656E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
 [2024-11-27 18:39:09] iteration      223/    1000 | consumed samples:        14272 | elapsed time per iteration (ms): 91591.3 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 4.536745E-06 | global batch size:    64 | lm loss: 6.243209E-01 | loss scale: 1.0 | grad norm: 1.036 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:40:40] iteration      224/    1000 | consumed samples:        14336 | elapsed time per iteration (ms): 90724.3 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 4.532092E-06 | global batch size:    64 | lm loss: 6.749411E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:42:01] iteration      225/    1000 | consumed samples:        14400 | elapsed time per iteration (ms): 81320.9 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 4.527417E-06 | global batch size:    64 | lm loss: 6.387181E-01 | loss scale: 1.0 | grad norm: 0.752 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 18:43:53] iteration      226/    1000 | consumed samples:        14464 | elapsed time per iteration (ms): 111897.3 | throughput per GPU (TFLOP/s/GPU): 68.9 | learning rate: 4.522722E-06 | global batch size:    64 | lm loss: 6.825730E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-27 18:45:13] iteration      227/    1000 | consumed samples:        14528 | elapsed time per iteration (ms): 79731.4 | throughput per GPU (TFLOP/s/GPU): 96.7 | learning rate: 4.518006E-06 | global batch size:    64 | lm loss: 6.488119E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:46:50] iteration      228/    1000 | consumed samples:        14592 | elapsed time per iteration (ms): 97086.8 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 4.513270E-06 | global batch size:    64 | lm loss: 6.314636E-01 | loss scale: 1.0 | grad norm: 1.432 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 18:48:09] iteration      229/    1000 | consumed samples:        14656 | elapsed time per iteration (ms): 79348.7 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 4.508513E-06 | global batch size:    64 | lm loss: 6.365232E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 18:50:58] iteration      230/    1000 | consumed samples:        14720 | elapsed time per iteration (ms): 168816.7 | throughput per GPU (TFLOP/s/GPU): 45.7 | learning rate: 4.503735E-06 | global batch size:    64 | lm loss: 6.728038E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
 [2024-11-27 18:52:17] iteration      231/    1000 | consumed samples:        14784 | elapsed time per iteration (ms): 78665.1 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 4.498937E-06 | global batch size:    64 | lm loss: 6.739997E-01 | loss scale: 1.0 | grad norm: 0.953 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
 [2024-11-27 18:54:14] iteration      232/    1000 | consumed samples:        14848 | elapsed time per iteration (ms): 117310.5 | throughput per GPU (TFLOP/s/GPU): 65.7 | learning rate: 4.494118E-06 | global batch size:    64 | lm loss: 6.592008E-01 | loss scale: 1.0 | grad norm: 1.044 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
 [2024-11-27 18:55:38] iteration      233/    1000 | consumed samples:        14912 | elapsed time per iteration (ms): 84354.7 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 4.489279E-06 | global batch size:    64 | lm loss: 6.571340E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
 [2024-11-27 18:57:01] iteration      234/    1000 | consumed samples:        14976 | elapsed time per iteration (ms): 82360.2 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 4.484420E-06 | global batch size:    64 | lm loss: 6.976779E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
 [2024-11-27 18:58:50] iteration      235/    1000 | consumed samples:        15040 | elapsed time per iteration (ms): 109720.4 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 4.479540E-06 | global batch size:    64 | lm loss: 7.036162E-01 | loss scale: 1.0 | grad norm: 0.975 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:00:08] iteration      236/    1000 | consumed samples:        15104 | elapsed time per iteration (ms): 77186.6 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 4.474640E-06 | global batch size:    64 | lm loss: 6.882964E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 19:01:30] iteration      237/    1000 | consumed samples:        15168 | elapsed time per iteration (ms): 82633.5 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 4.469720E-06 | global batch size:    64 | lm loss: 6.796645E-01 | loss scale: 1.0 | grad norm: 0.973 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-27 19:02:50] iteration      238/    1000 | consumed samples:        15232 | elapsed time per iteration (ms): 79756.3 | throughput per GPU (TFLOP/s/GPU): 96.7 | learning rate: 4.464780E-06 | global batch size:    64 | lm loss: 6.988075E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 19:04:03] iteration      239/    1000 | consumed samples:        15296 | elapsed time per iteration (ms): 73122.9 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.459820E-06 | global batch size:    64 | lm loss: 6.563836E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 19:05:27] iteration      240/    1000 | consumed samples:        15360 | elapsed time per iteration (ms): 83360.8 | throughput per GPU (TFLOP/s/GPU): 92.5 | learning rate: 4.454840E-06 | global batch size:    64 | lm loss: 7.006662E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-27 19:06:56] iteration      241/    1000 | consumed samples:        15424 | elapsed time per iteration (ms): 89125.8 | throughput per GPU (TFLOP/s/GPU): 86.5 | learning rate: 4.449839E-06 | global batch size:    64 | lm loss: 7.014512E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555def77b080] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] Missing reference picture, default is 65542
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] Missing reference picture, default is 65542
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 19:08:15] iteration      242/    1000 | consumed samples:        15488 | elapsed time per iteration (ms): 79642.5 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 4.444819E-06 | global batch size:    64 | lm loss: 6.564631E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-27 19:09:39] iteration      243/    1000 | consumed samples:        15552 | elapsed time per iteration (ms): 83777.7 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 4.439779E-06 | global batch size:    64 | lm loss: 7.187881E-01 | loss scale: 1.0 | grad norm: 1.296 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-27 19:10:56] iteration      244/    1000 | consumed samples:        15616 | elapsed time per iteration (ms): 76634.3 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 4.434719E-06 | global batch size:    64 | lm loss: 6.608657E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
 [2024-11-27 19:12:14] iteration      245/    1000 | consumed samples:        15680 | elapsed time per iteration (ms): 77894.3 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.429639E-06 | global batch size:    64 | lm loss: 6.903512E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
 [2024-11-27 19:13:50] iteration      246/    1000 | consumed samples:        15744 | elapsed time per iteration (ms): 96295.0 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 4.424540E-06 | global batch size:    64 | lm loss: 6.512361E-01 | loss scale: 1.0 | grad norm: 0.934 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
 [2024-11-27 19:14:55] iteration      247/    1000 | consumed samples:        15808 | elapsed time per iteration (ms): 64734.1 | throughput per GPU (TFLOP/s/GPU): 119.1 | learning rate: 4.419421E-06 | global batch size:    64 | lm loss: 6.166883E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-27 19:16:25] iteration      248/    1000 | consumed samples:        15872 | elapsed time per iteration (ms): 90164.6 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 4.414282E-06 | global batch size:    64 | lm loss: 6.597496E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 19:18:02] iteration      249/    1000 | consumed samples:        15936 | elapsed time per iteration (ms): 96840.0 | throughput per GPU (TFLOP/s/GPU): 79.6 | learning rate: 4.409124E-06 | global batch size:    64 | lm loss: 6.686405E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-27 19:19:17] iteration      250/    1000 | consumed samples:        16000 | elapsed time per iteration (ms): 74766.9 | throughput per GPU (TFLOP/s/GPU): 103.1 | learning rate: 4.403946E-06 | global batch size:    64 | lm loss: 6.316459E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
 [2024-11-27 19:20:51] iteration      251/    1000 | consumed samples:        16064 | elapsed time per iteration (ms): 94801.2 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 4.398749E-06 | global batch size:    64 | lm loss: 6.754164E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:22:08] iteration      252/    1000 | consumed samples:        16128 | elapsed time per iteration (ms): 76512.8 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.393533E-06 | global batch size:    64 | lm loss: 7.113305E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 19:23:32] iteration      253/    1000 | consumed samples:        16192 | elapsed time per iteration (ms): 84219.0 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 4.388297E-06 | global batch size:    64 | lm loss: 7.143953E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 19:24:58] iteration      254/    1000 | consumed samples:        16256 | elapsed time per iteration (ms): 85796.8 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.383042E-06 | global batch size:    64 | lm loss: 6.647977E-01 | loss scale: 1.0 | grad norm: 0.926 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-27 19:26:32] iteration      255/    1000 | consumed samples:        16320 | elapsed time per iteration (ms): 94433.6 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 4.377767E-06 | global batch size:    64 | lm loss: 7.061955E-01 | loss scale: 1.0 | grad norm: 1.360 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:28:04] iteration      256/    1000 | consumed samples:        16384 | elapsed time per iteration (ms): 91966.5 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 4.372474E-06 | global batch size:    64 | lm loss: 6.892250E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 19:29:28] iteration      257/    1000 | consumed samples:        16448 | elapsed time per iteration (ms): 83262.5 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 4.367161E-06 | global batch size:    64 | lm loss: 6.549360E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-27 19:30:52] iteration      258/    1000 | consumed samples:        16512 | elapsed time per iteration (ms): 84090.0 | throughput per GPU (TFLOP/s/GPU): 91.7 | learning rate: 4.361829E-06 | global batch size:    64 | lm loss: 6.244700E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 19:32:18] iteration      259/    1000 | consumed samples:        16576 | elapsed time per iteration (ms): 85835.7 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 4.356478E-06 | global batch size:    64 | lm loss: 7.172493E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:33:50] iteration      260/    1000 | consumed samples:        16640 | elapsed time per iteration (ms): 92333.1 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 4.351109E-06 | global batch size:    64 | lm loss: 6.703358E-01 | loss scale: 1.0 | grad norm: 1.093 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-27 19:35:19] iteration      261/    1000 | consumed samples:        16704 | elapsed time per iteration (ms): 89282.0 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.345720E-06 | global batch size:    64 | lm loss: 6.450242E-01 | loss scale: 1.0 | grad norm: 1.135 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
 [2024-11-27 19:36:41] iteration      262/    1000 | consumed samples:        16768 | elapsed time per iteration (ms): 81865.5 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.340313E-06 | global batch size:    64 | lm loss: 6.914611E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 19:38:21] iteration      263/    1000 | consumed samples:        16832 | elapsed time per iteration (ms): 100170.0 | throughput per GPU (TFLOP/s/GPU): 77.0 | learning rate: 4.334886E-06 | global batch size:    64 | lm loss: 6.484630E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 19:39:52] iteration      264/    1000 | consumed samples:        16896 | elapsed time per iteration (ms): 90530.8 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.329441E-06 | global batch size:    64 | lm loss: 6.836088E-01 | loss scale: 1.0 | grad norm: 0.978 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 19:42:18] iteration      265/    1000 | consumed samples:        16960 | elapsed time per iteration (ms): 146045.8 | throughput per GPU (TFLOP/s/GPU): 52.8 | learning rate: 4.323978E-06 | global batch size:    64 | lm loss: 6.735452E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-27 19:43:34] iteration      266/    1000 | consumed samples:        17024 | elapsed time per iteration (ms): 75962.2 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 4.318496E-06 | global batch size:    64 | lm loss: 7.168859E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-27 19:44:52] iteration      267/    1000 | consumed samples:        17088 | elapsed time per iteration (ms): 77862.5 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 4.312995E-06 | global batch size:    64 | lm loss: 7.042320E-01 | loss scale: 1.0 | grad norm: 0.832 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 19:46:26] iteration      268/    1000 | consumed samples:        17152 | elapsed time per iteration (ms): 94079.1 | throughput per GPU (TFLOP/s/GPU): 81.9 | learning rate: 4.307476E-06 | global batch size:    64 | lm loss: 7.036880E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-27 19:47:47] iteration      269/    1000 | consumed samples:        17216 | elapsed time per iteration (ms): 80930.5 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 4.301938E-06 | global batch size:    64 | lm loss: 8.056663E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-27 19:49:17] iteration      270/    1000 | consumed samples:        17280 | elapsed time per iteration (ms): 90603.4 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.296382E-06 | global batch size:    64 | lm loss: 6.936717E-01 | loss scale: 1.0 | grad norm: 1.050 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 19:52:28] iteration      271/    1000 | consumed samples:        17344 | elapsed time per iteration (ms): 190195.6 | throughput per GPU (TFLOP/s/GPU): 40.5 | learning rate: 4.290807E-06 | global batch size:    64 | lm loss: 7.112800E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:54:11] iteration      272/    1000 | consumed samples:        17408 | elapsed time per iteration (ms): 103571.7 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 4.285215E-06 | global batch size:    64 | lm loss: 7.187757E-01 | loss scale: 1.0 | grad norm: 0.988 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-27 19:55:39] iteration      273/    1000 | consumed samples:        17472 | elapsed time per iteration (ms): 88072.4 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 4.279604E-06 | global batch size:    64 | lm loss: 7.316838E-01 | loss scale: 1.0 | grad norm: 1.097 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 1000 unjoint_samples 1000 joint_samples 71 [34696, 112979]
processed_samples 1000 unjoint_samples 1000 joint_samples 71 [34696, 112979]
processed_samples 1000 unjoint_samples 1000 joint_samples 71 [97008, 105349]
processed_samples 1000 unjoint_samples 1000 joint_samples 71 [97008, 105349]
processed_samples 1000 unjoint_samples 1000 joint_samples 75 [56126, 109719]
processed_samples 1000 unjoint_samples 1000 joint_samples 75 [56126, 109719]
processed_samples 1000 unjoint_samples 1000 joint_samples 64 [123627, 39784]
processed_samples 1000 unjoint_samples 1000 joint_samples 64 [123627, 39784]
processed_samples 1000 unjoint_samples 1000 joint_samples 69 [122492, 103007]
processed_samples 1000 unjoint_samples 1000 joint_samples 69 [122492, 103007]
processed_samples 1000 unjoint_samples 1000 joint_samples 66 [100663, 105124]
processed_samples 1000 unjoint_samples 1000 joint_samples 66 [100663, 105124]
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
processed_samples 1000 unjoint_samples 1000 joint_samples 63 [87351, 127977]
processed_samples 1000 unjoint_samples 1000 joint_samples 63 [87351, 127977]
processed_samples 1000 unjoint_samples 1000 joint_samples 68 [127446, 15741]
processed_samples 1000 unjoint_samples 1000 joint_samples 68 [127446, 15741]
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d957c6e140] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957c6e140] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-27 19:57:09] iteration      274/    1000 | consumed samples:        17536 | elapsed time per iteration (ms): 90003.2 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 4.273975E-06 | global batch size:    64 | lm loss: 6.670929E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 19:58:29] iteration      275/    1000 | consumed samples:        17600 | elapsed time per iteration (ms): 79210.1 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 4.268328E-06 | global batch size:    64 | lm loss: 6.567287E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 20:00:53] iteration      276/    1000 | consumed samples:        17664 | elapsed time per iteration (ms): 144170.2 | throughput per GPU (TFLOP/s/GPU): 53.5 | learning rate: 4.262663E-06 | global batch size:    64 | lm loss: 6.671741E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-27 20:02:33] iteration      277/    1000 | consumed samples:        17728 | elapsed time per iteration (ms): 100716.0 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.256980E-06 | global batch size:    64 | lm loss: 6.349506E-01 | loss scale: 1.0 | grad norm: 1.063 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 20:04:16] iteration      278/    1000 | consumed samples:        17792 | elapsed time per iteration (ms): 102385.9 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 4.251279E-06 | global batch size:    64 | lm loss: 6.847430E-01 | loss scale: 1.0 | grad norm: 2.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 20:05:50] iteration      279/    1000 | consumed samples:        17856 | elapsed time per iteration (ms): 93700.5 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.245560E-06 | global batch size:    64 | lm loss: 6.650225E-01 | loss scale: 1.0 | grad norm: 0.928 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 20:07:22] iteration      280/    1000 | consumed samples:        17920 | elapsed time per iteration (ms): 92883.3 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 4.239823E-06 | global batch size:    64 | lm loss: 7.579253E-01 | loss scale: 1.0 | grad norm: 1.494 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-27 20:08:44] iteration      281/    1000 | consumed samples:        17984 | elapsed time per iteration (ms): 81491.8 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 4.234069E-06 | global batch size:    64 | lm loss: 6.707730E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 20:10:12] iteration      282/    1000 | consumed samples:        18048 | elapsed time per iteration (ms): 87592.8 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 4.228297E-06 | global batch size:    64 | lm loss: 7.038647E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-27 20:11:33] iteration      283/    1000 | consumed samples:        18112 | elapsed time per iteration (ms): 81693.7 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 4.222507E-06 | global batch size:    64 | lm loss: 7.013253E-01 | loss scale: 1.0 | grad norm: 1.013 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 20:12:53] iteration      284/    1000 | consumed samples:        18176 | elapsed time per iteration (ms): 79464.8 | throughput per GPU (TFLOP/s/GPU): 97.0 | learning rate: 4.216700E-06 | global batch size:    64 | lm loss: 7.138371E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 20:14:34] iteration      285/    1000 | consumed samples:        18240 | elapsed time per iteration (ms): 100817.1 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 4.210876E-06 | global batch size:    64 | lm loss: 6.813183E-01 | loss scale: 1.0 | grad norm: 0.847 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 20:15:48] iteration      286/    1000 | consumed samples:        18304 | elapsed time per iteration (ms): 74261.4 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 4.205033E-06 | global batch size:    64 | lm loss: 6.868650E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
 [2024-11-27 20:17:10] iteration      287/    1000 | consumed samples:        18368 | elapsed time per iteration (ms): 81979.8 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 4.199174E-06 | global batch size:    64 | lm loss: 6.615226E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 20:18:43] iteration      288/    1000 | consumed samples:        18432 | elapsed time per iteration (ms): 92724.0 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 4.193297E-06 | global batch size:    64 | lm loss: 6.780793E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 20:20:12] iteration      289/    1000 | consumed samples:        18496 | elapsed time per iteration (ms): 89460.1 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 4.187403E-06 | global batch size:    64 | lm loss: 6.989150E-01 | loss scale: 1.0 | grad norm: 0.779 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 20:21:52] iteration      290/    1000 | consumed samples:        18560 | elapsed time per iteration (ms): 99500.0 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 4.181492E-06 | global batch size:    64 | lm loss: 6.454962E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-27 20:23:08] iteration      291/    1000 | consumed samples:        18624 | elapsed time per iteration (ms): 76484.1 | throughput per GPU (TFLOP/s/GPU): 100.8 | learning rate: 4.175564E-06 | global batch size:    64 | lm loss: 6.725484E-01 | loss scale: 1.0 | grad norm: 0.911 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
 [2024-11-27 20:24:16] iteration      292/    1000 | consumed samples:        18688 | elapsed time per iteration (ms): 68308.3 | throughput per GPU (TFLOP/s/GPU): 112.8 | learning rate: 4.169619E-06 | global batch size:    64 | lm loss: 7.150757E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-27 20:25:37] iteration      293/    1000 | consumed samples:        18752 | elapsed time per iteration (ms): 80063.3 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 4.163656E-06 | global batch size:    64 | lm loss: 6.578319E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-27 20:27:17] iteration      294/    1000 | consumed samples:        18816 | elapsed time per iteration (ms): 100255.9 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 4.157677E-06 | global batch size:    64 | lm loss: 6.832387E-01 | loss scale: 1.0 | grad norm: 0.984 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-27 20:31:05] iteration      295/    1000 | consumed samples:        18880 | elapsed time per iteration (ms): 228503.4 | throughput per GPU (TFLOP/s/GPU): 33.7 | learning rate: 4.151681E-06 | global batch size:    64 | lm loss: 6.679215E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-27 20:32:22] iteration      296/    1000 | consumed samples:        18944 | elapsed time per iteration (ms): 76937.8 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 4.145668E-06 | global batch size:    64 | lm loss: 6.361890E-01 | loss scale: 1.0 | grad norm: 1.200 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 20:33:54] iteration      297/    1000 | consumed samples:        19008 | elapsed time per iteration (ms): 92103.0 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 4.139639E-06 | global batch size:    64 | lm loss: 7.190810E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
 [2024-11-27 20:37:06] iteration      298/    1000 | consumed samples:        19072 | elapsed time per iteration (ms): 191713.3 | throughput per GPU (TFLOP/s/GPU): 40.2 | learning rate: 4.133592E-06 | global batch size:    64 | lm loss: 6.651546E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 20:38:11] iteration      299/    1000 | consumed samples:        19136 | elapsed time per iteration (ms): 64999.4 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 4.127530E-06 | global batch size:    64 | lm loss: 6.572868E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
 [2024-11-27 20:39:31] iteration      300/    1000 | consumed samples:        19200 | elapsed time per iteration (ms): 79891.6 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 4.121450E-06 | global batch size:    64 | lm loss: 7.114632E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (206621.70, 206622.09)
 [2024-11-27 20:44:44] iteration      301/    1000 | consumed samples:        19264 | elapsed time per iteration (ms): 106072.5 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 4.115354E-06 | global batch size:    64 | lm loss: 7.603645E-01 | loss scale: 1.0 | grad norm: 0.897 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 20:46:07] iteration      302/    1000 | consumed samples:        19328 | elapsed time per iteration (ms): 83038.7 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.109242E-06 | global batch size:    64 | lm loss: 6.639850E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95613c240] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
 [2024-11-27 20:47:19] iteration      303/    1000 | consumed samples:        19392 | elapsed time per iteration (ms): 72130.6 | throughput per GPU (TFLOP/s/GPU): 106.9 | learning rate: 4.103113E-06 | global batch size:    64 | lm loss: 6.722466E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-27 20:49:15] iteration      304/    1000 | consumed samples:        19456 | elapsed time per iteration (ms): 116054.6 | throughput per GPU (TFLOP/s/GPU): 66.4 | learning rate: 4.096968E-06 | global batch size:    64 | lm loss: 6.590104E-01 | loss scale: 1.0 | grad norm: 0.740 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-27 20:51:10] iteration      305/    1000 | consumed samples:        19520 | elapsed time per iteration (ms): 114899.9 | throughput per GPU (TFLOP/s/GPU): 67.1 | learning rate: 4.090807E-06 | global batch size:    64 | lm loss: 6.442765E-01 | loss scale: 1.0 | grad norm: 1.196 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 20:52:19] iteration      306/    1000 | consumed samples:        19584 | elapsed time per iteration (ms): 69105.8 | throughput per GPU (TFLOP/s/GPU): 111.5 | learning rate: 4.084630E-06 | global batch size:    64 | lm loss: 6.731645E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-27 20:53:37] iteration      307/    1000 | consumed samples:        19648 | elapsed time per iteration (ms): 78406.8 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 4.078436E-06 | global batch size:    64 | lm loss: 6.682714E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 20:55:17] iteration      308/    1000 | consumed samples:        19712 | elapsed time per iteration (ms): 99738.4 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 4.072227E-06 | global batch size:    64 | lm loss: 6.655051E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 20:56:54] iteration      309/    1000 | consumed samples:        19776 | elapsed time per iteration (ms): 96419.7 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 4.066001E-06 | global batch size:    64 | lm loss: 7.021908E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
 [2024-11-27 20:58:10] iteration      310/    1000 | consumed samples:        19840 | elapsed time per iteration (ms): 75939.3 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 4.059760E-06 | global batch size:    64 | lm loss: 6.953746E-01 | loss scale: 1.0 | grad norm: 0.884 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 20:59:21] iteration      311/    1000 | consumed samples:        19904 | elapsed time per iteration (ms): 71031.9 | throughput per GPU (TFLOP/s/GPU): 108.5 | learning rate: 4.053503E-06 | global batch size:    64 | lm loss: 6.717713E-01 | loss scale: 1.0 | grad norm: 0.991 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
 [2024-11-27 21:00:32] iteration      312/    1000 | consumed samples:        19968 | elapsed time per iteration (ms): 71770.0 | throughput per GPU (TFLOP/s/GPU): 107.4 | learning rate: 4.047230E-06 | global batch size:    64 | lm loss: 6.943258E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 21:02:08] iteration      313/    1000 | consumed samples:        20032 | elapsed time per iteration (ms): 95766.8 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.040941E-06 | global batch size:    64 | lm loss: 6.254554E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 21:03:29] iteration      314/    1000 | consumed samples:        20096 | elapsed time per iteration (ms): 80604.1 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 4.034637E-06 | global batch size:    64 | lm loss: 6.780632E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-27 21:04:49] iteration      315/    1000 | consumed samples:        20160 | elapsed time per iteration (ms): 80301.8 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.028317E-06 | global batch size:    64 | lm loss: 6.939998E-01 | loss scale: 1.0 | grad norm: 1.131 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee7c5a00] mmco: unref short failure
[h264 @ 0x555dee7c5a00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee7c5a00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee7c5a00] mmco: unref short failure
[h264 @ 0x555dee7c5a00] mmco: unref short failure
 [2024-11-27 21:06:33] iteration      316/    1000 | consumed samples:        20224 | elapsed time per iteration (ms): 103987.9 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 4.021981E-06 | global batch size:    64 | lm loss: 6.664628E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec8de840] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 21:08:17] iteration      317/    1000 | consumed samples:        20288 | elapsed time per iteration (ms): 103536.3 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 4.015630E-06 | global batch size:    64 | lm loss: 6.802962E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
 [2024-11-27 21:09:56] iteration      318/    1000 | consumed samples:        20352 | elapsed time per iteration (ms): 98927.0 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 4.009264E-06 | global batch size:    64 | lm loss: 6.827909E-01 | loss scale: 1.0 | grad norm: 1.091 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-27 21:11:27] iteration      319/    1000 | consumed samples:        20416 | elapsed time per iteration (ms): 91131.2 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 4.002883E-06 | global batch size:    64 | lm loss: 7.089035E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-27 21:12:55] iteration      320/    1000 | consumed samples:        20480 | elapsed time per iteration (ms): 87896.1 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 3.996486E-06 | global batch size:    64 | lm loss: 7.206851E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] [h264 @ 0x555dee6ec240] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
 [2024-11-27 21:14:09] iteration      321/    1000 | consumed samples:        20544 | elapsed time per iteration (ms): 74712.4 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 3.990074E-06 | global batch size:    64 | lm loss: 6.479557E-01 | loss scale: 1.0 | grad norm: 1.319 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
 [2024-11-27 21:15:33] iteration      322/    1000 | consumed samples:        20608 | elapsed time per iteration (ms): 83255.3 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 3.983647E-06 | global batch size:    64 | lm loss: 6.448869E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-27 21:16:50] iteration      323/    1000 | consumed samples:        20672 | elapsed time per iteration (ms): 77458.4 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 3.977205E-06 | global batch size:    64 | lm loss: 7.210954E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dee6ec240] Missing reference picture, default is 65530
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955acf300] Missing reference picture, default is 65530
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-27 21:18:32] iteration      324/    1000 | consumed samples:        20736 | elapsed time per iteration (ms): 102000.0 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 3.970748E-06 | global batch size:    64 | lm loss: 6.985208E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
 [2024-11-27 21:19:45] iteration      325/    1000 | consumed samples:        20800 | elapsed time per iteration (ms): 73014.5 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 3.964276E-06 | global batch size:    64 | lm loss: 7.221389E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 21:21:08] iteration      326/    1000 | consumed samples:        20864 | elapsed time per iteration (ms): 82786.4 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 3.957789E-06 | global batch size:    64 | lm loss: 7.336783E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-27 21:22:48] iteration      327/    1000 | consumed samples:        20928 | elapsed time per iteration (ms): 99754.9 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 3.951287E-06 | global batch size:    64 | lm loss: 7.123046E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-27 21:24:02] iteration      328/    1000 | consumed samples:        20992 | elapsed time per iteration (ms): 74610.5 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 3.944771E-06 | global batch size:    64 | lm loss: 7.396567E-01 | loss scale: 1.0 | grad norm: 1.181 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
 [2024-11-27 21:25:33] iteration      329/    1000 | consumed samples:        21056 | elapsed time per iteration (ms): 90607.2 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 3.938240E-06 | global batch size:    64 | lm loss: 6.970116E-01 | loss scale: 1.0 | grad norm: 1.010 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
 [2024-11-27 21:26:51] iteration      330/    1000 | consumed samples:        21120 | elapsed time per iteration (ms): 77901.1 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 3.931695E-06 | global batch size:    64 | lm loss: 6.198275E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-27 21:27:59] iteration      331/    1000 | consumed samples:        21184 | elapsed time per iteration (ms): 67923.3 | throughput per GPU (TFLOP/s/GPU): 113.5 | learning rate: 3.925135E-06 | global batch size:    64 | lm loss: 7.459432E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-27 21:30:51] iteration      332/    1000 | consumed samples:        21248 | elapsed time per iteration (ms): 172399.5 | throughput per GPU (TFLOP/s/GPU): 44.7 | learning rate: 3.918560E-06 | global batch size:    64 | lm loss: 6.864884E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
 [2024-11-27 21:32:13] iteration      333/    1000 | consumed samples:        21312 | elapsed time per iteration (ms): 81667.0 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 3.911972E-06 | global batch size:    64 | lm loss: 6.252856E-01 | loss scale: 1.0 | grad norm: 1.149 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
 [2024-11-27 21:33:50] iteration      334/    1000 | consumed samples:        21376 | elapsed time per iteration (ms): 97273.8 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 3.905369E-06 | global batch size:    64 | lm loss: 7.187002E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-27 21:35:21] iteration      335/    1000 | consumed samples:        21440 | elapsed time per iteration (ms): 91328.7 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 3.898751E-06 | global batch size:    64 | lm loss: 6.292923E-01 | loss scale: 1.0 | grad norm: 0.950 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
 [2024-11-27 21:36:40] iteration      336/    1000 | consumed samples:        21504 | elapsed time per iteration (ms): 78764.0 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 3.892120E-06 | global batch size:    64 | lm loss: 6.587684E-01 | loss scale: 1.0 | grad norm: 1.085 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-27 21:38:02] iteration      337/    1000 | consumed samples:        21568 | elapsed time per iteration (ms): 81405.1 | throughput per GPU (TFLOP/s/GPU): 94.7 | learning rate: 3.885475E-06 | global batch size:    64 | lm loss: 6.140105E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 21:39:21] iteration      338/    1000 | consumed samples:        21632 | elapsed time per iteration (ms): 79667.3 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 3.878815E-06 | global batch size:    64 | lm loss: 6.848626E-01 | loss scale: 1.0 | grad norm: 0.956 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-27 21:40:47] iteration      339/    1000 | consumed samples:        21696 | elapsed time per iteration (ms): 85169.7 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 3.872142E-06 | global batch size:    64 | lm loss: 7.561729E-01 | loss scale: 1.0 | grad norm: 0.948 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
 [2024-11-27 21:42:38] iteration      340/    1000 | consumed samples:        21760 | elapsed time per iteration (ms): 111085.9 | throughput per GPU (TFLOP/s/GPU): 69.4 | learning rate: 3.865454E-06 | global batch size:    64 | lm loss: 7.680401E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 21:45:50] iteration      341/    1000 | consumed samples:        21824 | elapsed time per iteration (ms): 191908.5 | throughput per GPU (TFLOP/s/GPU): 40.2 | learning rate: 3.858753E-06 | global batch size:    64 | lm loss: 6.925346E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-27 21:47:07] iteration      342/    1000 | consumed samples:        21888 | elapsed time per iteration (ms): 77534.8 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 3.852039E-06 | global batch size:    64 | lm loss: 7.017056E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
 [2024-11-27 21:48:40] iteration      343/    1000 | consumed samples:        21952 | elapsed time per iteration (ms): 92606.0 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 3.845310E-06 | global batch size:    64 | lm loss: 6.684653E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
 [2024-11-27 21:50:24] iteration      344/    1000 | consumed samples:        22016 | elapsed time per iteration (ms): 104328.5 | throughput per GPU (TFLOP/s/GPU): 73.9 | learning rate: 3.838568E-06 | global batch size:    64 | lm loss: 6.584975E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 21:51:43] iteration      345/    1000 | consumed samples:        22080 | elapsed time per iteration (ms): 78557.2 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 3.831812E-06 | global batch size:    64 | lm loss: 7.187029E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-27 21:52:58] iteration      346/    1000 | consumed samples:        22144 | elapsed time per iteration (ms): 75116.0 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 3.825043E-06 | global batch size:    64 | lm loss: 6.967052E-01 | loss scale: 1.0 | grad norm: 1.003 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
 [2024-11-27 21:54:41] iteration      347/    1000 | consumed samples:        22208 | elapsed time per iteration (ms): 103431.0 | throughput per GPU (TFLOP/s/GPU): 74.5 | learning rate: 3.818261E-06 | global batch size:    64 | lm loss: 7.444284E-01 | loss scale: 1.0 | grad norm: 1.182 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-27 21:56:19] iteration      348/    1000 | consumed samples:        22272 | elapsed time per iteration (ms): 97685.4 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 3.811465E-06 | global batch size:    64 | lm loss: 6.478388E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 21:57:59] iteration      349/    1000 | consumed samples:        22336 | elapsed time per iteration (ms): 100372.1 | throughput per GPU (TFLOP/s/GPU): 76.8 | learning rate: 3.804656E-06 | global batch size:    64 | lm loss: 6.880268E-01 | loss scale: 1.0 | grad norm: 0.730 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-27 22:00:34] iteration      350/    1000 | consumed samples:        22400 | elapsed time per iteration (ms): 154667.2 | throughput per GPU (TFLOP/s/GPU): 49.8 | learning rate: 3.797834E-06 | global batch size:    64 | lm loss: 7.512761E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
 [2024-11-27 22:02:09] iteration      351/    1000 | consumed samples:        22464 | elapsed time per iteration (ms): 95160.2 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 3.790999E-06 | global batch size:    64 | lm loss: 7.407740E-01 | loss scale: 1.0 | grad norm: 0.972 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
 [2024-11-27 22:03:16] iteration      352/    1000 | consumed samples:        22528 | elapsed time per iteration (ms): 66347.4 | throughput per GPU (TFLOP/s/GPU): 116.2 | learning rate: 3.784151E-06 | global batch size:    64 | lm loss: 6.471483E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-27 22:05:04] iteration      353/    1000 | consumed samples:        22592 | elapsed time per iteration (ms): 108513.0 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 3.777290E-06 | global batch size:    64 | lm loss: 6.076833E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 22:06:37] iteration      354/    1000 | consumed samples:        22656 | elapsed time per iteration (ms): 92546.4 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 3.770416E-06 | global batch size:    64 | lm loss: 6.827998E-01 | loss scale: 1.0 | grad norm: 1.299 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
 [2024-11-27 22:08:07] iteration      355/    1000 | consumed samples:        22720 | elapsed time per iteration (ms): 90414.0 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 3.763529E-06 | global batch size:    64 | lm loss: 6.579286E-01 | loss scale: 1.0 | grad norm: 1.292 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-27 22:09:37] iteration      356/    1000 | consumed samples:        22784 | elapsed time per iteration (ms): 89877.2 | throughput per GPU (TFLOP/s/GPU): 85.8 | learning rate: 3.756630E-06 | global batch size:    64 | lm loss: 7.134993E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-27 22:11:35] iteration      357/    1000 | consumed samples:        22848 | elapsed time per iteration (ms): 118027.1 | throughput per GPU (TFLOP/s/GPU): 65.3 | learning rate: 3.749717E-06 | global batch size:    64 | lm loss: 7.717846E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
 [2024-11-27 22:13:00] iteration      358/    1000 | consumed samples:        22912 | elapsed time per iteration (ms): 85382.6 | throughput per GPU (TFLOP/s/GPU): 90.3 | learning rate: 3.742793E-06 | global batch size:    64 | lm loss: 6.735170E-01 | loss scale: 1.0 | grad norm: 0.716 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 22:14:27] iteration      359/    1000 | consumed samples:        22976 | elapsed time per iteration (ms): 86401.7 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.735855E-06 | global batch size:    64 | lm loss: 6.633616E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-27 22:17:18] iteration      360/    1000 | consumed samples:        23040 | elapsed time per iteration (ms): 170996.0 | throughput per GPU (TFLOP/s/GPU): 45.1 | learning rate: 3.728906E-06 | global batch size:    64 | lm loss: 6.863374E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
 [2024-11-27 22:18:46] iteration      361/    1000 | consumed samples:        23104 | elapsed time per iteration (ms): 88048.3 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 3.721943E-06 | global batch size:    64 | lm loss: 7.136717E-01 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 22:20:03] iteration      362/    1000 | consumed samples:        23168 | elapsed time per iteration (ms): 76992.2 | throughput per GPU (TFLOP/s/GPU): 100.1 | learning rate: 3.714969E-06 | global batch size:    64 | lm loss: 6.679822E-01 | loss scale: 1.0 | grad norm: 0.699 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 22:21:44] iteration      363/    1000 | consumed samples:        23232 | elapsed time per iteration (ms): 101637.7 | throughput per GPU (TFLOP/s/GPU): 75.8 | learning rate: 3.707982E-06 | global batch size:    64 | lm loss: 6.850898E-01 | loss scale: 1.0 | grad norm: 1.204 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 22:23:38] iteration      364/    1000 | consumed samples:        23296 | elapsed time per iteration (ms): 113918.9 | throughput per GPU (TFLOP/s/GPU): 67.7 | learning rate: 3.700984E-06 | global batch size:    64 | lm loss: 6.541436E-01 | loss scale: 1.0 | grad norm: 1.360 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df368a780] mmco: unref short failure
[h264 @ 0x555df368a780] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 22:25:22] iteration      365/    1000 | consumed samples:        23360 | elapsed time per iteration (ms): 103746.9 | throughput per GPU (TFLOP/s/GPU): 74.3 | learning rate: 3.693973E-06 | global batch size:    64 | lm loss: 6.338899E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
 [2024-11-27 22:26:38] iteration      366/    1000 | consumed samples:        23424 | elapsed time per iteration (ms): 75846.3 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 3.686950E-06 | global batch size:    64 | lm loss: 6.501561E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
 [2024-11-27 22:27:59] iteration      367/    1000 | consumed samples:        23488 | elapsed time per iteration (ms): 80842.6 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 3.679915E-06 | global batch size:    64 | lm loss: 6.532076E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-27 22:29:14] iteration      368/    1000 | consumed samples:        23552 | elapsed time per iteration (ms): 75294.2 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 3.672869E-06 | global batch size:    64 | lm loss: 7.082227E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 22:31:07] iteration      369/    1000 | consumed samples:        23616 | elapsed time per iteration (ms): 113118.2 | throughput per GPU (TFLOP/s/GPU): 68.1 | learning rate: 3.665810E-06 | global batch size:    64 | lm loss: 6.782001E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 22:32:41] iteration      370/    1000 | consumed samples:        23680 | elapsed time per iteration (ms): 93459.1 | throughput per GPU (TFLOP/s/GPU): 82.5 | learning rate: 3.658740E-06 | global batch size:    64 | lm loss: 6.857321E-01 | loss scale: 1.0 | grad norm: 0.755 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-27 22:33:58] iteration      371/    1000 | consumed samples:        23744 | elapsed time per iteration (ms): 77477.9 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 3.651659E-06 | global batch size:    64 | lm loss: 6.332526E-01 | loss scale: 1.0 | grad norm: 0.824 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 22:36:11] iteration      372/    1000 | consumed samples:        23808 | elapsed time per iteration (ms): 133201.3 | throughput per GPU (TFLOP/s/GPU): 57.9 | learning rate: 3.644565E-06 | global batch size:    64 | lm loss: 6.694164E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 22:37:24] iteration      373/    1000 | consumed samples:        23872 | elapsed time per iteration (ms): 72420.9 | throughput per GPU (TFLOP/s/GPU): 106.4 | learning rate: 3.637460E-06 | global batch size:    64 | lm loss: 6.407461E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 22:38:37] iteration      374/    1000 | consumed samples:        23936 | elapsed time per iteration (ms): 72701.7 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 3.630344E-06 | global batch size:    64 | lm loss: 6.712391E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-27 22:40:02] iteration      375/    1000 | consumed samples:        24000 | elapsed time per iteration (ms): 85487.1 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 3.623217E-06 | global batch size:    64 | lm loss: 6.910558E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 22:41:12] iteration      376/    1000 | consumed samples:        24064 | elapsed time per iteration (ms): 69634.9 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 3.616078E-06 | global batch size:    64 | lm loss: 8.254275E-01 | loss scale: 1.0 | grad norm: 1.206 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-27 22:42:21] iteration      377/    1000 | consumed samples:        24128 | elapsed time per iteration (ms): 69132.3 | throughput per GPU (TFLOP/s/GPU): 111.5 | learning rate: 3.608928E-06 | global batch size:    64 | lm loss: 7.217027E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 22:44:28] iteration      378/    1000 | consumed samples:        24192 | elapsed time per iteration (ms): 127310.0 | throughput per GPU (TFLOP/s/GPU): 60.5 | learning rate: 3.601767E-06 | global batch size:    64 | lm loss: 6.989653E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-27 22:45:37] iteration      379/    1000 | consumed samples:        24256 | elapsed time per iteration (ms): 68915.3 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 3.594595E-06 | global batch size:    64 | lm loss: 7.196112E-01 | loss scale: 1.0 | grad norm: 0.894 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 22:47:03] iteration      380/    1000 | consumed samples:        24320 | elapsed time per iteration (ms): 85554.5 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.587412E-06 | global batch size:    64 | lm loss: 7.016000E-01 | loss scale: 1.0 | grad norm: 0.764 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 22:48:49] iteration      381/    1000 | consumed samples:        24384 | elapsed time per iteration (ms): 106037.3 | throughput per GPU (TFLOP/s/GPU): 72.7 | learning rate: 3.580218E-06 | global batch size:    64 | lm loss: 7.071315E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-27 22:50:23] iteration      382/    1000 | consumed samples:        24448 | elapsed time per iteration (ms): 93817.4 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 3.573013E-06 | global batch size:    64 | lm loss: 6.622055E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 22:51:47] iteration      383/    1000 | consumed samples:        24512 | elapsed time per iteration (ms): 84171.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 3.565798E-06 | global batch size:    64 | lm loss: 6.726893E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 22:53:28] iteration      384/    1000 | consumed samples:        24576 | elapsed time per iteration (ms): 101481.2 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 3.558572E-06 | global batch size:    64 | lm loss: 6.746169E-01 | loss scale: 1.0 | grad norm: 0.820 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-27 22:54:49] iteration      385/    1000 | consumed samples:        24640 | elapsed time per iteration (ms): 80548.4 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 3.551335E-06 | global batch size:    64 | lm loss: 7.008195E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-27 22:56:18] iteration      386/    1000 | consumed samples:        24704 | elapsed time per iteration (ms): 89194.3 | throughput per GPU (TFLOP/s/GPU): 86.4 | learning rate: 3.544088E-06 | global batch size:    64 | lm loss: 6.912686E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
 [2024-11-27 22:57:44] iteration      387/    1000 | consumed samples:        24768 | elapsed time per iteration (ms): 86242.0 | throughput per GPU (TFLOP/s/GPU): 89.4 | learning rate: 3.536830E-06 | global batch size:    64 | lm loss: 6.442069E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 22:59:04] iteration      388/    1000 | consumed samples:        24832 | elapsed time per iteration (ms): 80049.2 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 3.529562E-06 | global batch size:    64 | lm loss: 6.354716E-01 | loss scale: 1.0 | grad norm: 0.705 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 23:00:24] iteration      389/    1000 | consumed samples:        24896 | elapsed time per iteration (ms): 80050.4 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 3.522284E-06 | global batch size:    64 | lm loss: 6.333457E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 23:01:53] iteration      390/    1000 | consumed samples:        24960 | elapsed time per iteration (ms): 88041.1 | throughput per GPU (TFLOP/s/GPU): 87.6 | learning rate: 3.514996E-06 | global batch size:    64 | lm loss: 7.811137E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
 [2024-11-27 23:03:19] iteration      391/    1000 | consumed samples:        25024 | elapsed time per iteration (ms): 86427.6 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.507697E-06 | global batch size:    64 | lm loss: 6.654625E-01 | loss scale: 1.0 | grad norm: 0.799 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 23:04:43] iteration      392/    1000 | consumed samples:        25088 | elapsed time per iteration (ms): 84323.1 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 3.500388E-06 | global batch size:    64 | lm loss: 7.324268E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 23:06:08] iteration      393/    1000 | consumed samples:        25152 | elapsed time per iteration (ms): 84520.0 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 3.493070E-06 | global batch size:    64 | lm loss: 7.075290E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-27 23:08:06] iteration      394/    1000 | consumed samples:        25216 | elapsed time per iteration (ms): 118235.3 | throughput per GPU (TFLOP/s/GPU): 65.2 | learning rate: 3.485741E-06 | global batch size:    64 | lm loss: 7.145350E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-27 23:09:20] iteration      395/    1000 | consumed samples:        25280 | elapsed time per iteration (ms): 73522.1 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 3.478403E-06 | global batch size:    64 | lm loss: 6.982417E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-27 23:10:55] iteration      396/    1000 | consumed samples:        25344 | elapsed time per iteration (ms): 95138.1 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 3.471055E-06 | global batch size:    64 | lm loss: 6.405466E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-27 23:12:19] iteration      397/    1000 | consumed samples:        25408 | elapsed time per iteration (ms): 84195.5 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 3.463697E-06 | global batch size:    64 | lm loss: 6.580856E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-27 23:14:06] iteration      398/    1000 | consumed samples:        25472 | elapsed time per iteration (ms): 107101.7 | throughput per GPU (TFLOP/s/GPU): 72.0 | learning rate: 3.456330E-06 | global batch size:    64 | lm loss: 7.027028E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-27 23:16:59] iteration      399/    1000 | consumed samples:        25536 | elapsed time per iteration (ms): 173130.9 | throughput per GPU (TFLOP/s/GPU): 44.5 | learning rate: 3.448953E-06 | global batch size:    64 | lm loss: 7.018545E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 23:18:21] iteration      400/    1000 | consumed samples:        25600 | elapsed time per iteration (ms): 81999.1 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 3.441567E-06 | global batch size:    64 | lm loss: 6.496854E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (229487.34, 229487.67)
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-27 23:23:39] iteration      401/    1000 | consumed samples:        25664 | elapsed time per iteration (ms): 88544.4 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 3.434172E-06 | global batch size:    64 | lm loss: 8.111320E-01 | loss scale: 1.0 | grad norm: 1.285 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
 [2024-11-27 23:25:00] iteration      402/    1000 | consumed samples:        25728 | elapsed time per iteration (ms): 80221.6 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 3.426767E-06 | global batch size:    64 | lm loss: 6.549807E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec6b6640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-27 23:26:11] iteration      403/    1000 | consumed samples:        25792 | elapsed time per iteration (ms): 71144.5 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 3.419353E-06 | global batch size:    64 | lm loss: 6.545295E-01 | loss scale: 1.0 | grad norm: 0.752 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 23:27:33] iteration      404/    1000 | consumed samples:        25856 | elapsed time per iteration (ms): 82690.1 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 3.411930E-06 | global batch size:    64 | lm loss: 6.961581E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 23:29:07] iteration      405/    1000 | consumed samples:        25920 | elapsed time per iteration (ms): 93627.9 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 3.404497E-06 | global batch size:    64 | lm loss: 6.395518E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-27 23:30:29] iteration      406/    1000 | consumed samples:        25984 | elapsed time per iteration (ms): 81863.0 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.397056E-06 | global batch size:    64 | lm loss: 6.411680E-01 | loss scale: 1.0 | grad norm: 1.623 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 23:31:38] iteration      407/    1000 | consumed samples:        26048 | elapsed time per iteration (ms): 69084.0 | throughput per GPU (TFLOP/s/GPU): 111.6 | learning rate: 3.389606E-06 | global batch size:    64 | lm loss: 6.546772E-01 | loss scale: 1.0 | grad norm: 0.769 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-27 23:33:10] iteration      408/    1000 | consumed samples:        26112 | elapsed time per iteration (ms): 91986.0 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 3.382147E-06 | global batch size:    64 | lm loss: 6.975461E-01 | loss scale: 1.0 | grad norm: 1.242 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 23:34:37] iteration      409/    1000 | consumed samples:        26176 | elapsed time per iteration (ms): 87219.4 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 3.374680E-06 | global batch size:    64 | lm loss: 6.866903E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 104 [83603, 101734]
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 104 [83603, 101734]
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 99 [99965, 39100]
processed_samples 1500 unjoint_samples 1500 joint_samples 99 [99965, 39100]
processed_samples 1500 unjoint_samples 1500 joint_samples 98 [115630, 100088]
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 98 [115630, 100088]
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 96 [46295, 93229]
processed_samples 1500 unjoint_samples 1500 joint_samples 96 [46295, 93229]
 [2024-11-27 23:36:20] iteration      410/    1000 | consumed samples:        26240 | elapsed time per iteration (ms): 102666.0 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 3.367203E-06 | global batch size:    64 | lm loss: 6.706418E-01 | loss scale: 1.0 | grad norm: 1.213 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 1500 unjoint_samples 1500 joint_samples 111 [110989, 83124]
processed_samples 1500 unjoint_samples 1500 joint_samples 111 [110989, 83124]
processed_samples 1500 unjoint_samples 1500 joint_samples 105 [128523, 100177]
processed_samples 1500 unjoint_samples 1500 joint_samples 105 [128523, 100177]
processed_samples 1500 unjoint_samples 1500 joint_samples 99 [113112, 123890]
processed_samples 1500 unjoint_samples 1500 joint_samples 99 [113112, 123890]
processed_samples 1500 unjoint_samples 1500 joint_samples 109 [107974, 121638]
processed_samples 1500 unjoint_samples 1500 joint_samples 109 [107974, 121638]
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 23:37:52] iteration      411/    1000 | consumed samples:        26304 | elapsed time per iteration (ms): 91989.4 | throughput per GPU (TFLOP/s/GPU): 83.8 | learning rate: 3.359719E-06 | global batch size:    64 | lm loss: 6.757123E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 23:41:09] iteration      412/    1000 | consumed samples:        26368 | elapsed time per iteration (ms): 196820.6 | throughput per GPU (TFLOP/s/GPU): 39.2 | learning rate: 3.352225E-06 | global batch size:    64 | lm loss: 6.460667E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-27 23:43:18] iteration      413/    1000 | consumed samples:        26432 | elapsed time per iteration (ms): 129341.2 | throughput per GPU (TFLOP/s/GPU): 59.6 | learning rate: 3.344724E-06 | global batch size:    64 | lm loss: 6.627863E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-27 23:44:34] iteration      414/    1000 | consumed samples:        26496 | elapsed time per iteration (ms): 75545.7 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 3.337214E-06 | global batch size:    64 | lm loss: 6.712230E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-27 23:46:07] iteration      415/    1000 | consumed samples:        26560 | elapsed time per iteration (ms): 93195.6 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 3.329695E-06 | global batch size:    64 | lm loss: 7.355238E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9570eaac0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
 [2024-11-27 23:47:22] iteration      416/    1000 | consumed samples:        26624 | elapsed time per iteration (ms): 75505.2 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 3.322169E-06 | global batch size:    64 | lm loss: 6.778505E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 23:48:37] iteration      417/    1000 | consumed samples:        26688 | elapsed time per iteration (ms): 74429.2 | throughput per GPU (TFLOP/s/GPU): 103.6 | learning rate: 3.314634E-06 | global batch size:    64 | lm loss: 6.249003E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-27 23:49:49] iteration      418/    1000 | consumed samples:        26752 | elapsed time per iteration (ms): 72617.8 | throughput per GPU (TFLOP/s/GPU): 106.2 | learning rate: 3.307092E-06 | global batch size:    64 | lm loss: 7.011807E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
 [2024-11-27 23:51:04] iteration      419/    1000 | consumed samples:        26816 | elapsed time per iteration (ms): 74084.5 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 3.299541E-06 | global batch size:    64 | lm loss: 6.895205E-01 | loss scale: 1.0 | grad norm: 0.944 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
 [2024-11-27 23:52:18] iteration      420/    1000 | consumed samples:        26880 | elapsed time per iteration (ms): 74639.6 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 3.291983E-06 | global batch size:    64 | lm loss: 6.247544E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-27 23:53:40] iteration      421/    1000 | consumed samples:        26944 | elapsed time per iteration (ms): 81462.5 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 3.284416E-06 | global batch size:    64 | lm loss: 6.611052E-01 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
 [2024-11-27 23:54:56] iteration      422/    1000 | consumed samples:        27008 | elapsed time per iteration (ms): 76765.4 | throughput per GPU (TFLOP/s/GPU): 100.4 | learning rate: 3.276843E-06 | global batch size:    64 | lm loss: 7.140992E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 23:56:17] iteration      423/    1000 | consumed samples:        27072 | elapsed time per iteration (ms): 80528.3 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 3.269261E-06 | global batch size:    64 | lm loss: 6.346519E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-27 23:57:33] iteration      424/    1000 | consumed samples:        27136 | elapsed time per iteration (ms): 76346.0 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 3.261672E-06 | global batch size:    64 | lm loss: 6.726185E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-27 23:59:47] iteration      425/    1000 | consumed samples:        27200 | elapsed time per iteration (ms): 133679.8 | throughput per GPU (TFLOP/s/GPU): 57.7 | learning rate: 3.254075E-06 | global batch size:    64 | lm loss: 7.108432E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 00:01:09] iteration      426/    1000 | consumed samples:        27264 | elapsed time per iteration (ms): 82143.2 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 3.246472E-06 | global batch size:    64 | lm loss: 6.866354E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 00:02:27] iteration      427/    1000 | consumed samples:        27328 | elapsed time per iteration (ms): 77428.6 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 3.238860E-06 | global batch size:    64 | lm loss: 6.865810E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 00:03:41] iteration      428/    1000 | consumed samples:        27392 | elapsed time per iteration (ms): 74443.4 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 3.231242E-06 | global batch size:    64 | lm loss: 6.732894E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 00:05:10] iteration      429/    1000 | consumed samples:        27456 | elapsed time per iteration (ms): 88691.8 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 3.223616E-06 | global batch size:    64 | lm loss: 5.977175E-01 | loss scale: 1.0 | grad norm: 0.753 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 00:06:37] iteration      430/    1000 | consumed samples:        27520 | elapsed time per iteration (ms): 87480.3 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 3.215984E-06 | global batch size:    64 | lm loss: 6.304359E-01 | loss scale: 1.0 | grad norm: 1.090 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 00:07:58] iteration      431/    1000 | consumed samples:        27584 | elapsed time per iteration (ms): 80605.4 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 3.208344E-06 | global batch size:    64 | lm loss: 6.228154E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95eeb7b40] mmco: unref short failure
 [2024-11-28 00:09:08] iteration      432/    1000 | consumed samples:        27648 | elapsed time per iteration (ms): 69948.3 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 3.200697E-06 | global batch size:    64 | lm loss: 6.354905E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee8137c0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee8137c0] mmco: unref short failure
[h264 @ 0x555dee8137c0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
 [2024-11-28 00:10:45] iteration      433/    1000 | consumed samples:        27712 | elapsed time per iteration (ms): 96983.5 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 3.193044E-06 | global batch size:    64 | lm loss: 6.355907E-01 | loss scale: 1.0 | grad norm: 0.977 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 00:12:07] iteration      434/    1000 | consumed samples:        27776 | elapsed time per iteration (ms): 82614.7 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 3.185384E-06 | global batch size:    64 | lm loss: 6.905267E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 00:13:50] iteration      435/    1000 | consumed samples:        27840 | elapsed time per iteration (ms): 102409.5 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 3.177717E-06 | global batch size:    64 | lm loss: 6.286079E-01 | loss scale: 1.0 | grad norm: 0.714 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
 [2024-11-28 00:15:18] iteration      436/    1000 | consumed samples:        27904 | elapsed time per iteration (ms): 88263.1 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.170044E-06 | global batch size:    64 | lm loss: 6.589005E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-28 00:16:55] iteration      437/    1000 | consumed samples:        27968 | elapsed time per iteration (ms): 96811.4 | throughput per GPU (TFLOP/s/GPU): 79.6 | learning rate: 3.162364E-06 | global batch size:    64 | lm loss: 6.647096E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555df0bdeb40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 00:18:31] iteration      438/    1000 | consumed samples:        28032 | elapsed time per iteration (ms): 95986.1 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 3.154678E-06 | global batch size:    64 | lm loss: 6.228602E-01 | loss scale: 1.0 | grad norm: 0.879 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 00:20:04] iteration      439/    1000 | consumed samples:        28096 | elapsed time per iteration (ms): 93509.2 | throughput per GPU (TFLOP/s/GPU): 82.4 | learning rate: 3.146985E-06 | global batch size:    64 | lm loss: 6.783546E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
 [2024-11-28 00:22:07] iteration      440/    1000 | consumed samples:        28160 | elapsed time per iteration (ms): 122703.1 | throughput per GPU (TFLOP/s/GPU): 62.8 | learning rate: 3.139286E-06 | global batch size:    64 | lm loss: 6.260735E-01 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 00:23:26] iteration      441/    1000 | consumed samples:        28224 | elapsed time per iteration (ms): 78888.0 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 3.131581E-06 | global batch size:    64 | lm loss: 7.180869E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
 [2024-11-28 00:24:32] iteration      442/    1000 | consumed samples:        28288 | elapsed time per iteration (ms): 65837.4 | throughput per GPU (TFLOP/s/GPU): 117.1 | learning rate: 3.123870E-06 | global batch size:    64 | lm loss: 6.873631E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 00:26:03] iteration      443/    1000 | consumed samples:        28352 | elapsed time per iteration (ms): 91043.0 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 3.116153E-06 | global batch size:    64 | lm loss: 6.662303E-01 | loss scale: 1.0 | grad norm: 0.792 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 00:27:20] iteration      444/    1000 | consumed samples:        28416 | elapsed time per iteration (ms): 76727.9 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 3.108430E-06 | global batch size:    64 | lm loss: 6.865659E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 00:28:42] iteration      445/    1000 | consumed samples:        28480 | elapsed time per iteration (ms): 82107.0 | throughput per GPU (TFLOP/s/GPU): 93.9 | learning rate: 3.100701E-06 | global batch size:    64 | lm loss: 6.737128E-01 | loss scale: 1.0 | grad norm: 1.008 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 00:30:05] iteration      446/    1000 | consumed samples:        28544 | elapsed time per iteration (ms): 83389.9 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.092966E-06 | global batch size:    64 | lm loss: 6.630214E-01 | loss scale: 1.0 | grad norm: 0.971 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 00:31:40] iteration      447/    1000 | consumed samples:        28608 | elapsed time per iteration (ms): 94623.0 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 3.085225E-06 | global batch size:    64 | lm loss: 6.231685E-01 | loss scale: 1.0 | grad norm: 1.500 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 00:33:05] iteration      448/    1000 | consumed samples:        28672 | elapsed time per iteration (ms): 85533.0 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.077479E-06 | global batch size:    64 | lm loss: 6.156682E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 00:34:31] iteration      449/    1000 | consumed samples:        28736 | elapsed time per iteration (ms): 85558.9 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 3.069728E-06 | global batch size:    64 | lm loss: 6.721120E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 00:35:53] iteration      450/    1000 | consumed samples:        28800 | elapsed time per iteration (ms): 82352.0 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 3.061971E-06 | global batch size:    64 | lm loss: 6.951571E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
 [2024-11-28 00:37:18] iteration      451/    1000 | consumed samples:        28864 | elapsed time per iteration (ms): 84925.7 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 3.054208E-06 | global batch size:    64 | lm loss: 6.865417E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 00:38:41] iteration      452/    1000 | consumed samples:        28928 | elapsed time per iteration (ms): 82996.1 | throughput per GPU (TFLOP/s/GPU): 92.9 | learning rate: 3.046440E-06 | global batch size:    64 | lm loss: 6.977232E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
 [2024-11-28 00:39:56] iteration      453/    1000 | consumed samples:        28992 | elapsed time per iteration (ms): 74453.3 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 3.038667E-06 | global batch size:    64 | lm loss: 5.958530E-01 | loss scale: 1.0 | grad norm: 0.829 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 00:41:11] iteration      454/    1000 | consumed samples:        29056 | elapsed time per iteration (ms): 75397.2 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 3.030889E-06 | global batch size:    64 | lm loss: 6.823794E-01 | loss scale: 1.0 | grad norm: 0.788 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-28 00:43:17] iteration      455/    1000 | consumed samples:        29120 | elapsed time per iteration (ms): 125556.0 | throughput per GPU (TFLOP/s/GPU): 61.4 | learning rate: 3.023106E-06 | global batch size:    64 | lm loss: 6.797455E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 00:44:48] iteration      456/    1000 | consumed samples:        29184 | elapsed time per iteration (ms): 90955.3 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 3.015318E-06 | global batch size:    64 | lm loss: 6.156820E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 00:47:40] iteration      457/    1000 | consumed samples:        29248 | elapsed time per iteration (ms): 171908.3 | throughput per GPU (TFLOP/s/GPU): 44.8 | learning rate: 3.007525E-06 | global batch size:    64 | lm loss: 7.012556E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 00:48:58] iteration      458/    1000 | consumed samples:        29312 | elapsed time per iteration (ms): 78811.4 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 2.999727E-06 | global batch size:    64 | lm loss: 5.902069E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 00:50:28] iteration      459/    1000 | consumed samples:        29376 | elapsed time per iteration (ms): 89652.5 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 2.991925E-06 | global batch size:    64 | lm loss: 6.764930E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-28 00:51:42] iteration      460/    1000 | consumed samples:        29440 | elapsed time per iteration (ms): 73936.2 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 2.984118E-06 | global batch size:    64 | lm loss: 6.732601E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 00:53:14] iteration      461/    1000 | consumed samples:        29504 | elapsed time per iteration (ms): 92328.7 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 2.976306E-06 | global batch size:    64 | lm loss: 7.627915E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
 [2024-11-28 00:55:17] iteration      462/    1000 | consumed samples:        29568 | elapsed time per iteration (ms): 122529.1 | throughput per GPU (TFLOP/s/GPU): 62.9 | learning rate: 2.968490E-06 | global batch size:    64 | lm loss: 6.704935E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 00:57:22] iteration      463/    1000 | consumed samples:        29632 | elapsed time per iteration (ms): 124583.0 | throughput per GPU (TFLOP/s/GPU): 61.9 | learning rate: 2.960670E-06 | global batch size:    64 | lm loss: 7.774192E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x555dedf1d500] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-28 00:58:40] iteration      464/    1000 | consumed samples:        29696 | elapsed time per iteration (ms): 78714.4 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 2.952845E-06 | global batch size:    64 | lm loss: 6.651924E-01 | loss scale: 1.0 | grad norm: 0.995 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
 [2024-11-28 00:59:59] iteration      465/    1000 | consumed samples:        29760 | elapsed time per iteration (ms): 78413.5 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 2.945016E-06 | global batch size:    64 | lm loss: 6.635730E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 01:01:23] iteration      466/    1000 | consumed samples:        29824 | elapsed time per iteration (ms): 83878.6 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 2.937183E-06 | global batch size:    64 | lm loss: 6.822602E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 01:03:07] iteration      467/    1000 | consumed samples:        29888 | elapsed time per iteration (ms): 104690.7 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 2.929345E-06 | global batch size:    64 | lm loss: 6.986808E-01 | loss scale: 1.0 | grad norm: 1.090 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
 [2024-11-28 01:04:23] iteration      468/    1000 | consumed samples:        29952 | elapsed time per iteration (ms): 75443.1 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 2.921504E-06 | global batch size:    64 | lm loss: 6.062692E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
 [2024-11-28 01:05:45] iteration      469/    1000 | consumed samples:        30016 | elapsed time per iteration (ms): 82693.9 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.913659E-06 | global batch size:    64 | lm loss: 6.660897E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
 [2024-11-28 01:08:12] iteration      470/    1000 | consumed samples:        30080 | elapsed time per iteration (ms): 147025.3 | throughput per GPU (TFLOP/s/GPU): 52.4 | learning rate: 2.905810E-06 | global batch size:    64 | lm loss: 6.328305E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
 [2024-11-28 01:09:25] iteration      471/    1000 | consumed samples:        30144 | elapsed time per iteration (ms): 72860.1 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 2.897957E-06 | global batch size:    64 | lm loss: 6.858408E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555deed21240] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
 [2024-11-28 01:11:13] iteration      472/    1000 | consumed samples:        30208 | elapsed time per iteration (ms): 107941.5 | throughput per GPU (TFLOP/s/GPU): 71.4 | learning rate: 2.890101E-06 | global batch size:    64 | lm loss: 7.221738E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
 [2024-11-28 01:12:40] iteration      473/    1000 | consumed samples:        30272 | elapsed time per iteration (ms): 86691.0 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.882241E-06 | global batch size:    64 | lm loss: 6.783547E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
 [2024-11-28 01:14:19] iteration      474/    1000 | consumed samples:        30336 | elapsed time per iteration (ms): 98989.4 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 2.874378E-06 | global batch size:    64 | lm loss: 6.476663E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 01:15:46] iteration      475/    1000 | consumed samples:        30400 | elapsed time per iteration (ms): 86919.5 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 2.866511E-06 | global batch size:    64 | lm loss: 7.588748E-01 | loss scale: 1.0 | grad norm: 1.063 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
 [2024-11-28 01:17:37] iteration      476/    1000 | consumed samples:        30464 | elapsed time per iteration (ms): 110927.6 | throughput per GPU (TFLOP/s/GPU): 69.5 | learning rate: 2.858641E-06 | global batch size:    64 | lm loss: 6.688645E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
 [2024-11-28 01:19:10] iteration      477/    1000 | consumed samples:        30528 | elapsed time per iteration (ms): 92831.5 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 2.850767E-06 | global batch size:    64 | lm loss: 6.710600E-01 | loss scale: 1.0 | grad norm: 0.973 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 01:20:48] iteration      478/    1000 | consumed samples:        30592 | elapsed time per iteration (ms): 98092.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 2.842891E-06 | global batch size:    64 | lm loss: 5.867029E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] [h264 @ 0x555ded679600] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 01:22:14] iteration      479/    1000 | consumed samples:        30656 | elapsed time per iteration (ms): 86072.4 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 2.835011E-06 | global batch size:    64 | lm loss: 6.228137E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
 [2024-11-28 01:23:43] iteration      480/    1000 | consumed samples:        30720 | elapsed time per iteration (ms): 88842.9 | throughput per GPU (TFLOP/s/GPU): 86.8 | learning rate: 2.827129E-06 | global batch size:    64 | lm loss: 6.328573E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
 [2024-11-28 01:26:05] iteration      481/    1000 | consumed samples:        30784 | elapsed time per iteration (ms): 141791.4 | throughput per GPU (TFLOP/s/GPU): 54.4 | learning rate: 2.819243E-06 | global batch size:    64 | lm loss: 6.670113E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
 [2024-11-28 01:27:31] iteration      482/    1000 | consumed samples:        30848 | elapsed time per iteration (ms): 86684.9 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.811355E-06 | global batch size:    64 | lm loss: 7.225530E-01 | loss scale: 1.0 | grad norm: 1.018 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
 [2024-11-28 01:29:31] iteration      483/    1000 | consumed samples:        30912 | elapsed time per iteration (ms): 119464.5 | throughput per GPU (TFLOP/s/GPU): 64.5 | learning rate: 2.803464E-06 | global batch size:    64 | lm loss: 6.893146E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 01:30:36] iteration      484/    1000 | consumed samples:        30976 | elapsed time per iteration (ms): 65499.5 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 2.795570E-06 | global batch size:    64 | lm loss: 6.424096E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 01:31:52] iteration      485/    1000 | consumed samples:        31040 | elapsed time per iteration (ms): 75993.8 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 2.787674E-06 | global batch size:    64 | lm loss: 6.655073E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
 [2024-11-28 01:33:34] iteration      486/    1000 | consumed samples:        31104 | elapsed time per iteration (ms): 101309.3 | throughput per GPU (TFLOP/s/GPU): 76.1 | learning rate: 2.779775E-06 | global batch size:    64 | lm loss: 6.528612E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
 [2024-11-28 01:34:47] iteration      487/    1000 | consumed samples:        31168 | elapsed time per iteration (ms): 73484.9 | throughput per GPU (TFLOP/s/GPU): 104.9 | learning rate: 2.771874E-06 | global batch size:    64 | lm loss: 6.569068E-01 | loss scale: 1.0 | grad norm: 0.904 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-28 01:36:11] iteration      488/    1000 | consumed samples:        31232 | elapsed time per iteration (ms): 84370.3 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 2.763971E-06 | global batch size:    64 | lm loss: 6.942212E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 01:37:28] iteration      489/    1000 | consumed samples:        31296 | elapsed time per iteration (ms): 76239.9 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 2.756065E-06 | global batch size:    64 | lm loss: 6.821255E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 01:38:35] iteration      490/    1000 | consumed samples:        31360 | elapsed time per iteration (ms): 67252.0 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 2.748157E-06 | global batch size:    64 | lm loss: 6.273899E-01 | loss scale: 1.0 | grad norm: 0.730 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
 [2024-11-28 01:40:13] iteration      491/    1000 | consumed samples:        31424 | elapsed time per iteration (ms): 98216.4 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 2.740247E-06 | global batch size:    64 | lm loss: 6.733717E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 01:41:59] iteration      492/    1000 | consumed samples:        31488 | elapsed time per iteration (ms): 106125.0 | throughput per GPU (TFLOP/s/GPU): 72.6 | learning rate: 2.732335E-06 | global batch size:    64 | lm loss: 6.473522E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-28 01:43:46] iteration      493/    1000 | consumed samples:        31552 | elapsed time per iteration (ms): 106629.2 | throughput per GPU (TFLOP/s/GPU): 72.3 | learning rate: 2.724421E-06 | global batch size:    64 | lm loss: 6.177192E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
 [2024-11-28 01:45:03] iteration      494/    1000 | consumed samples:        31616 | elapsed time per iteration (ms): 76563.4 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 2.716506E-06 | global batch size:    64 | lm loss: 7.289105E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-28 01:46:25] iteration      495/    1000 | consumed samples:        31680 | elapsed time per iteration (ms): 82717.2 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.708588E-06 | global batch size:    64 | lm loss: 6.200334E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 01:47:53] iteration      496/    1000 | consumed samples:        31744 | elapsed time per iteration (ms): 87485.8 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 2.700669E-06 | global batch size:    64 | lm loss: 6.855780E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
 [2024-11-28 01:49:06] iteration      497/    1000 | consumed samples:        31808 | elapsed time per iteration (ms): 72862.1 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 2.692748E-06 | global batch size:    64 | lm loss: 6.573794E-01 | loss scale: 1.0 | grad norm: 0.970 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 01:50:14] iteration      498/    1000 | consumed samples:        31872 | elapsed time per iteration (ms): 68859.5 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 2.684826E-06 | global batch size:    64 | lm loss: 7.352840E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
 [2024-11-28 01:51:31] iteration      499/    1000 | consumed samples:        31936 | elapsed time per iteration (ms): 76698.5 | throughput per GPU (TFLOP/s/GPU): 100.5 | learning rate: 2.676902E-06 | global batch size:    64 | lm loss: 6.518524E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
 [2024-11-28 01:52:58] iteration      500/    1000 | consumed samples:        32000 | elapsed time per iteration (ms): 86461.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 2.668977E-06 | global batch size:    64 | lm loss: 7.033284E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (253152.30, 253152.62)
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 01:58:33] iteration      501/    1000 | consumed samples:        32064 | elapsed time per iteration (ms): 81971.3 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.661051E-06 | global batch size:    64 | lm loss: 6.540481E-01 | loss scale: 1.0 | grad norm: 1.066 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-28 01:59:52] iteration      502/    1000 | consumed samples:        32128 | elapsed time per iteration (ms): 79089.7 | throughput per GPU (TFLOP/s/GPU): 97.5 | learning rate: 2.653124E-06 | global batch size:    64 | lm loss: 6.702466E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
 [2024-11-28 02:01:19] iteration      503/    1000 | consumed samples:        32192 | elapsed time per iteration (ms): 87171.6 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 2.645195E-06 | global batch size:    64 | lm loss: 6.394791E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deea7ef00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 02:02:54] iteration      504/    1000 | consumed samples:        32256 | elapsed time per iteration (ms): 95308.5 | throughput per GPU (TFLOP/s/GPU): 80.9 | learning rate: 2.637266E-06 | global batch size:    64 | lm loss: 7.753518E-01 | loss scale: 1.0 | grad norm: 1.224 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 02:04:44] iteration      505/    1000 | consumed samples:        32320 | elapsed time per iteration (ms): 109858.1 | throughput per GPU (TFLOP/s/GPU): 70.2 | learning rate: 2.629336E-06 | global batch size:    64 | lm loss: 6.935418E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 02:07:14] iteration      506/    1000 | consumed samples:        32384 | elapsed time per iteration (ms): 149922.7 | throughput per GPU (TFLOP/s/GPU): 51.4 | learning rate: 2.621404E-06 | global batch size:    64 | lm loss: 6.945059E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:08:31] iteration      507/    1000 | consumed samples:        32448 | elapsed time per iteration (ms): 76903.3 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.613473E-06 | global batch size:    64 | lm loss: 6.709553E-01 | loss scale: 1.0 | grad norm: 0.908 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-28 02:11:03] iteration      508/    1000 | consumed samples:        32512 | elapsed time per iteration (ms): 152007.0 | throughput per GPU (TFLOP/s/GPU): 50.7 | learning rate: 2.605540E-06 | global batch size:    64 | lm loss: 6.527804E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956386f00] Missing reference picture, default is 65530
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedf05880] Missing reference picture, default is 65530
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
 [2024-11-28 02:12:14] iteration      509/    1000 | consumed samples:        32576 | elapsed time per iteration (ms): 71228.4 | throughput per GPU (TFLOP/s/GPU): 108.2 | learning rate: 2.597607E-06 | global batch size:    64 | lm loss: 6.511109E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:13:38] iteration      510/    1000 | consumed samples:        32640 | elapsed time per iteration (ms): 83181.4 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 2.589673E-06 | global batch size:    64 | lm loss: 7.232425E-01 | loss scale: 1.0 | grad norm: 1.021 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
 [2024-11-28 02:15:14] iteration      511/    1000 | consumed samples:        32704 | elapsed time per iteration (ms): 96538.9 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 2.581739E-06 | global batch size:    64 | lm loss: 6.949818E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 02:16:38] iteration      512/    1000 | consumed samples:        32768 | elapsed time per iteration (ms): 84289.3 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 2.573804E-06 | global batch size:    64 | lm loss: 6.826892E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
 [2024-11-28 02:17:53] iteration      513/    1000 | consumed samples:        32832 | elapsed time per iteration (ms): 74479.1 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 2.565870E-06 | global batch size:    64 | lm loss: 6.208887E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 02:19:45] iteration      514/    1000 | consumed samples:        32896 | elapsed time per iteration (ms): 112413.5 | throughput per GPU (TFLOP/s/GPU): 68.6 | learning rate: 2.557935E-06 | global batch size:    64 | lm loss: 6.680191E-01 | loss scale: 1.0 | grad norm: 1.049 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 02:21:03] iteration      515/    1000 | consumed samples:        32960 | elapsed time per iteration (ms): 77731.3 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.550000E-06 | global batch size:    64 | lm loss: 6.346695E-01 | loss scale: 1.0 | grad norm: 1.071 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:22:56] iteration      516/    1000 | consumed samples:        33024 | elapsed time per iteration (ms): 112534.7 | throughput per GPU (TFLOP/s/GPU): 68.5 | learning rate: 2.542065E-06 | global batch size:    64 | lm loss: 6.696709E-01 | loss scale: 1.0 | grad norm: 1.112 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 02:24:38] iteration      517/    1000 | consumed samples:        33088 | elapsed time per iteration (ms): 102222.6 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 2.534130E-06 | global batch size:    64 | lm loss: 6.623139E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:26:02] iteration      518/    1000 | consumed samples:        33152 | elapsed time per iteration (ms): 83961.8 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 2.526196E-06 | global batch size:    64 | lm loss: 6.795787E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 02:27:54] iteration      519/    1000 | consumed samples:        33216 | elapsed time per iteration (ms): 111781.7 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 2.518261E-06 | global batch size:    64 | lm loss: 6.612890E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-28 02:29:16] iteration      520/    1000 | consumed samples:        33280 | elapsed time per iteration (ms): 82821.7 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 2.510327E-06 | global batch size:    64 | lm loss: 6.640331E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 02:30:56] iteration      521/    1000 | consumed samples:        33344 | elapsed time per iteration (ms): 99758.6 | throughput per GPU (TFLOP/s/GPU): 77.3 | learning rate: 2.502393E-06 | global batch size:    64 | lm loss: 7.587718E-01 | loss scale: 1.0 | grad norm: 0.721 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957eff780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957eff780] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957eff780] mmco: unref short failure
 [2024-11-28 02:32:14] iteration      522/    1000 | consumed samples:        33408 | elapsed time per iteration (ms): 77707.6 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.494460E-06 | global batch size:    64 | lm loss: 6.407965E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 02:33:56] iteration      523/    1000 | consumed samples:        33472 | elapsed time per iteration (ms): 102303.9 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 2.486527E-06 | global batch size:    64 | lm loss: 7.260080E-01 | loss scale: 1.0 | grad norm: 1.015 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:35:12] iteration      524/    1000 | consumed samples:        33536 | elapsed time per iteration (ms): 76197.9 | throughput per GPU (TFLOP/s/GPU): 101.2 | learning rate: 2.478596E-06 | global batch size:    64 | lm loss: 6.908966E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
 [2024-11-28 02:36:29] iteration      525/    1000 | consumed samples:        33600 | elapsed time per iteration (ms): 76456.3 | throughput per GPU (TFLOP/s/GPU): 100.8 | learning rate: 2.470664E-06 | global batch size:    64 | lm loss: 6.749615E-01 | loss scale: 1.0 | grad norm: 0.942 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
 [2024-11-28 02:38:03] iteration      526/    1000 | consumed samples:        33664 | elapsed time per iteration (ms): 93927.9 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 2.462734E-06 | global batch size:    64 | lm loss: 6.362700E-01 | loss scale: 1.0 | grad norm: 0.938 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:39:42] iteration      527/    1000 | consumed samples:        33728 | elapsed time per iteration (ms): 99402.2 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 2.454805E-06 | global batch size:    64 | lm loss: 6.780536E-01 | loss scale: 1.0 | grad norm: 9.398 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:41:00] iteration      528/    1000 | consumed samples:        33792 | elapsed time per iteration (ms): 77721.5 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.446876E-06 | global batch size:    64 | lm loss: 8.120739E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 02:42:50] iteration      529/    1000 | consumed samples:        33856 | elapsed time per iteration (ms): 110064.9 | throughput per GPU (TFLOP/s/GPU): 70.0 | learning rate: 2.438949E-06 | global batch size:    64 | lm loss: 6.335684E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 02:44:03] iteration      530/    1000 | consumed samples:        33920 | elapsed time per iteration (ms): 72503.1 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 2.431023E-06 | global batch size:    64 | lm loss: 6.750400E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 02:45:33] iteration      531/    1000 | consumed samples:        33984 | elapsed time per iteration (ms): 90134.3 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 2.423098E-06 | global batch size:    64 | lm loss: 6.134099E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
 [2024-11-28 02:47:06] iteration      532/    1000 | consumed samples:        34048 | elapsed time per iteration (ms): 92802.6 | throughput per GPU (TFLOP/s/GPU): 83.1 | learning rate: 2.415174E-06 | global batch size:    64 | lm loss: 6.257591E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:48:30] iteration      533/    1000 | consumed samples:        34112 | elapsed time per iteration (ms): 84370.2 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 2.407252E-06 | global batch size:    64 | lm loss: 6.479869E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 02:49:39] iteration      534/    1000 | consumed samples:        34176 | elapsed time per iteration (ms): 68564.5 | throughput per GPU (TFLOP/s/GPU): 112.4 | learning rate: 2.399331E-06 | global batch size:    64 | lm loss: 6.712840E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 02:51:14] iteration      535/    1000 | consumed samples:        34240 | elapsed time per iteration (ms): 95872.4 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 2.391412E-06 | global batch size:    64 | lm loss: 6.123539E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 02:52:25] iteration      536/    1000 | consumed samples:        34304 | elapsed time per iteration (ms): 70396.4 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 2.383494E-06 | global batch size:    64 | lm loss: 6.391864E-01 | loss scale: 1.0 | grad norm: 0.809 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def77b080] mmco: unref short failure
[h264 @ 0x555def77b080] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def77b080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
 [2024-11-28 02:54:03] iteration      537/    1000 | consumed samples:        34368 | elapsed time per iteration (ms): 97681.3 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 2.375579E-06 | global batch size:    64 | lm loss: 6.583289E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-28 02:55:47] iteration      538/    1000 | consumed samples:        34432 | elapsed time per iteration (ms): 104400.8 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.367665E-06 | global batch size:    64 | lm loss: 7.243518E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 02:57:07] iteration      539/    1000 | consumed samples:        34496 | elapsed time per iteration (ms): 80003.5 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 2.359753E-06 | global batch size:    64 | lm loss: 6.497184E-01 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 02:59:06] iteration      540/    1000 | consumed samples:        34560 | elapsed time per iteration (ms): 119114.9 | throughput per GPU (TFLOP/s/GPU): 64.7 | learning rate: 2.351843E-06 | global batch size:    64 | lm loss: 6.795231E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 03:00:26] iteration      541/    1000 | consumed samples:        34624 | elapsed time per iteration (ms): 80182.6 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 2.343935E-06 | global batch size:    64 | lm loss: 7.331969E-01 | loss scale: 1.0 | grad norm: 0.863 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 03:01:58] iteration      542/    1000 | consumed samples:        34688 | elapsed time per iteration (ms): 91273.9 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 2.336029E-06 | global batch size:    64 | lm loss: 7.497106E-01 | loss scale: 1.0 | grad norm: 1.812 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 03:03:11] iteration      543/    1000 | consumed samples:        34752 | elapsed time per iteration (ms): 73655.6 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 2.328126E-06 | global batch size:    64 | lm loss: 6.712408E-01 | loss scale: 1.0 | grad norm: 35.621 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 03:05:10] iteration      544/    1000 | consumed samples:        34816 | elapsed time per iteration (ms): 118369.6 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 2.320225E-06 | global batch size:    64 | lm loss: 6.137346E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 03:08:43] iteration      545/    1000 | consumed samples:        34880 | elapsed time per iteration (ms): 213453.2 | throughput per GPU (TFLOP/s/GPU): 36.1 | learning rate: 2.312326E-06 | global batch size:    64 | lm loss: 6.905041E-01 | loss scale: 1.0 | grad norm: 2.642 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 03:10:41] iteration      546/    1000 | consumed samples:        34944 | elapsed time per iteration (ms): 118083.8 | throughput per GPU (TFLOP/s/GPU): 65.3 | learning rate: 2.304430E-06 | global batch size:    64 | lm loss: 6.795663E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 03:13:12] iteration      547/    1000 | consumed samples:        35008 | elapsed time per iteration (ms): 151065.5 | throughput per GPU (TFLOP/s/GPU): 51.0 | learning rate: 2.296536E-06 | global batch size:    64 | lm loss: 6.740454E-01 | loss scale: 1.0 | grad norm: 0.963 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-28 03:14:56] iteration      548/    1000 | consumed samples:        35072 | elapsed time per iteration (ms): 103599.5 | throughput per GPU (TFLOP/s/GPU): 74.4 | learning rate: 2.288645E-06 | global batch size:    64 | lm loss: 6.875500E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-28 03:17:11] iteration      549/    1000 | consumed samples:        35136 | elapsed time per iteration (ms): 135405.8 | throughput per GPU (TFLOP/s/GPU): 56.9 | learning rate: 2.280757E-06 | global batch size:    64 | lm loss: 7.654210E-01 | loss scale: 1.0 | grad norm: 1.331 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-28 03:18:34] iteration      550/    1000 | consumed samples:        35200 | elapsed time per iteration (ms): 83113.3 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 2.272871E-06 | global batch size:    64 | lm loss: 7.020198E-01 | loss scale: 1.0 | grad norm: 0.947 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 03:19:54] iteration      551/    1000 | consumed samples:        35264 | elapsed time per iteration (ms): 79527.9 | throughput per GPU (TFLOP/s/GPU): 96.9 | learning rate: 2.264989E-06 | global batch size:    64 | lm loss: 7.161276E-01 | loss scale: 1.0 | grad norm: 1.478 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-28 03:21:09] iteration      552/    1000 | consumed samples:        35328 | elapsed time per iteration (ms): 75060.8 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 2.257109E-06 | global batch size:    64 | lm loss: 6.593223E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 03:22:35] iteration      553/    1000 | consumed samples:        35392 | elapsed time per iteration (ms): 86407.9 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 2.249233E-06 | global batch size:    64 | lm loss: 6.227847E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 03:23:53] iteration      554/    1000 | consumed samples:        35456 | elapsed time per iteration (ms): 77190.3 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 2.241359E-06 | global batch size:    64 | lm loss: 6.859035E-01 | loss scale: 1.0 | grad norm: 9.515 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 2000 unjoint_samples 2000 joint_samples 139 [104870, 75543]
processed_samples 2000 unjoint_samples 2000 joint_samples 134 [82659, 118848]
processed_samples 2000 unjoint_samples 2000 joint_samples 139 [104870, 75543]
processed_samples 2000 unjoint_samples 2000 joint_samples 134 [82659, 118848]
processed_samples 2000 unjoint_samples 2000 joint_samples 132 [78151, 115723]
processed_samples 2000 unjoint_samples 2000 joint_samples 132 [78151, 115723]
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 137 [102192, 79149]
processed_samples 2000 unjoint_samples 2000 joint_samples 137 [102192, 79149]
 [2024-11-28 03:25:10] iteration      555/    1000 | consumed samples:        35520 | elapsed time per iteration (ms): 76910.6 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.233489E-06 | global batch size:    64 | lm loss: 6.369921E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 2000 unjoint_samples 2000 joint_samples 146 [111400, 83335]
processed_samples 2000 unjoint_samples 2000 joint_samples 146 [111400, 83335]
processed_samples 2000 unjoint_samples 2000 joint_samples 134 [114848, 116982]
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 134 [114848, 116982]
processed_samples 2000 unjoint_samples 2000 joint_samples 146 [122504, 130548]
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 146 [122504, 130548]
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 143 [38121, 93882]
processed_samples 2000 unjoint_samples 2000 joint_samples 143 [38121, 93882]
 [2024-11-28 03:26:30] iteration      556/    1000 | consumed samples:        35584 | elapsed time per iteration (ms): 80902.6 | throughput per GPU (TFLOP/s/GPU): 95.3 | learning rate: 2.225622E-06 | global batch size:    64 | lm loss: 6.364502E-01 | loss scale: 1.0 | grad norm: 12.179 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
 [2024-11-28 03:27:50] iteration      557/    1000 | consumed samples:        35648 | elapsed time per iteration (ms): 79243.5 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 2.217759E-06 | global batch size:    64 | lm loss: 6.272420E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 03:29:20] iteration      558/    1000 | consumed samples:        35712 | elapsed time per iteration (ms): 90139.8 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 2.209899E-06 | global batch size:    64 | lm loss: 6.555175E-01 | loss scale: 1.0 | grad norm: 1.026 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-28 03:30:50] iteration      559/    1000 | consumed samples:        35776 | elapsed time per iteration (ms): 89959.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 2.202043E-06 | global batch size:    64 | lm loss: 6.715569E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 03:32:19] iteration      560/    1000 | consumed samples:        35840 | elapsed time per iteration (ms): 89186.0 | throughput per GPU (TFLOP/s/GPU): 86.4 | learning rate: 2.194190E-06 | global batch size:    64 | lm loss: 7.095491E-01 | loss scale: 1.0 | grad norm: 0.880 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 03:33:50] iteration      561/    1000 | consumed samples:        35904 | elapsed time per iteration (ms): 91058.7 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 2.186341E-06 | global batch size:    64 | lm loss: 6.814739E-01 | loss scale: 1.0 | grad norm: 1.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 03:35:29] iteration      562/    1000 | consumed samples:        35968 | elapsed time per iteration (ms): 99208.0 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 2.178496E-06 | global batch size:    64 | lm loss: 6.558337E-01 | loss scale: 1.0 | grad norm: 0.976 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 03:37:21] iteration      563/    1000 | consumed samples:        36032 | elapsed time per iteration (ms): 111663.2 | throughput per GPU (TFLOP/s/GPU): 69.0 | learning rate: 2.170655E-06 | global batch size:    64 | lm loss: 7.521163E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 03:38:40] iteration      564/    1000 | consumed samples:        36096 | elapsed time per iteration (ms): 78978.4 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 2.162817E-06 | global batch size:    64 | lm loss: 6.506171E-01 | loss scale: 1.0 | grad norm: 0.910 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76fac0] mmco: unref short failure
[h264 @ 0x55d95c76fac0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95c76fac0] mmco: unref short failure
 [2024-11-28 03:39:50] iteration      565/    1000 | consumed samples:        36160 | elapsed time per iteration (ms): 70465.5 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 2.154984E-06 | global batch size:    64 | lm loss: 6.828893E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 03:42:24] iteration      566/    1000 | consumed samples:        36224 | elapsed time per iteration (ms): 153198.9 | throughput per GPU (TFLOP/s/GPU): 50.3 | learning rate: 2.147155E-06 | global batch size:    64 | lm loss: 6.904967E-01 | loss scale: 1.0 | grad norm: 0.966 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 03:44:00] iteration      567/    1000 | consumed samples:        36288 | elapsed time per iteration (ms): 95808.9 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 2.139330E-06 | global batch size:    64 | lm loss: 6.944407E-01 | loss scale: 1.0 | grad norm: 1.575 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 03:45:30] iteration      568/    1000 | consumed samples:        36352 | elapsed time per iteration (ms): 90076.5 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 2.131510E-06 | global batch size:    64 | lm loss: 6.663601E-01 | loss scale: 1.0 | grad norm: 0.754 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 03:47:43] iteration      569/    1000 | consumed samples:        36416 | elapsed time per iteration (ms): 133694.0 | throughput per GPU (TFLOP/s/GPU): 57.7 | learning rate: 2.123694E-06 | global batch size:    64 | lm loss: 6.749229E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
 [2024-11-28 03:49:26] iteration      570/    1000 | consumed samples:        36480 | elapsed time per iteration (ms): 103054.1 | throughput per GPU (TFLOP/s/GPU): 74.8 | learning rate: 2.115882E-06 | global batch size:    64 | lm loss: 6.709623E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-28 03:50:43] iteration      571/    1000 | consumed samples:        36544 | elapsed time per iteration (ms): 76619.1 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.108075E-06 | global batch size:    64 | lm loss: 6.776207E-01 | loss scale: 1.0 | grad norm: 0.946 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-28 03:52:25] iteration      572/    1000 | consumed samples:        36608 | elapsed time per iteration (ms): 101916.4 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 2.100273E-06 | global batch size:    64 | lm loss: 6.679552E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 03:53:45] iteration      573/    1000 | consumed samples:        36672 | elapsed time per iteration (ms): 79866.7 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 2.092475E-06 | global batch size:    64 | lm loss: 6.936182E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-28 03:55:21] iteration      574/    1000 | consumed samples:        36736 | elapsed time per iteration (ms): 96071.2 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 2.084682E-06 | global batch size:    64 | lm loss: 6.289401E-01 | loss scale: 1.0 | grad norm: 1.045 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-28 03:56:43] iteration      575/    1000 | consumed samples:        36800 | elapsed time per iteration (ms): 82515.1 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 2.076894E-06 | global batch size:    64 | lm loss: 6.630487E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 03:58:24] iteration      576/    1000 | consumed samples:        36864 | elapsed time per iteration (ms): 100449.7 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 2.069111E-06 | global batch size:    64 | lm loss: 7.089751E-01 | loss scale: 1.0 | grad norm: 0.920 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 03:59:46] iteration      577/    1000 | consumed samples:        36928 | elapsed time per iteration (ms): 81737.9 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 2.061333E-06 | global batch size:    64 | lm loss: 6.779444E-01 | loss scale: 1.0 | grad norm: 0.950 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
 [2024-11-28 04:01:12] iteration      578/    1000 | consumed samples:        36992 | elapsed time per iteration (ms): 85889.6 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.053560E-06 | global batch size:    64 | lm loss: 6.224477E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555deee52400] Missing reference picture, default is 65530
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] Missing reference picture, default is 65530
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] Missing reference picture, default is 65530
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] Missing reference picture, default is 65530
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-28 04:02:38] iteration      579/    1000 | consumed samples:        37056 | elapsed time per iteration (ms): 86490.3 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 2.045792E-06 | global batch size:    64 | lm loss: 7.127038E-01 | loss scale: 1.0 | grad norm: 1.609 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:03:48] iteration      580/    1000 | consumed samples:        37120 | elapsed time per iteration (ms): 69500.1 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 2.038029E-06 | global batch size:    64 | lm loss: 6.738371E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555de1b1b9c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 04:04:54] iteration      581/    1000 | consumed samples:        37184 | elapsed time per iteration (ms): 66004.8 | throughput per GPU (TFLOP/s/GPU): 116.8 | learning rate: 2.030272E-06 | global batch size:    64 | lm loss: 7.272795E-01 | loss scale: 1.0 | grad norm: 1.192 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 04:06:33] iteration      582/    1000 | consumed samples:        37248 | elapsed time per iteration (ms): 99201.2 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 2.022521E-06 | global batch size:    64 | lm loss: 7.001043E-01 | loss scale: 1.0 | grad norm: 0.864 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 04:07:49] iteration      583/    1000 | consumed samples:        37312 | elapsed time per iteration (ms): 76158.1 | throughput per GPU (TFLOP/s/GPU): 101.2 | learning rate: 2.014775E-06 | global batch size:    64 | lm loss: 6.584710E-01 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 04:09:29] iteration      584/    1000 | consumed samples:        37376 | elapsed time per iteration (ms): 100293.9 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 2.007034E-06 | global batch size:    64 | lm loss: 7.019646E-01 | loss scale: 1.0 | grad norm: 1.057 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda7a5c0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 04:11:02] iteration      585/    1000 | consumed samples:        37440 | elapsed time per iteration (ms): 92320.3 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 1.999299E-06 | global batch size:    64 | lm loss: 6.701733E-01 | loss scale: 1.0 | grad norm: 1.212 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 04:12:17] iteration      586/    1000 | consumed samples:        37504 | elapsed time per iteration (ms): 75791.0 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.991570E-06 | global batch size:    64 | lm loss: 6.158227E-01 | loss scale: 1.0 | grad norm: 0.814 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 04:14:01] iteration      587/    1000 | consumed samples:        37568 | elapsed time per iteration (ms): 103921.2 | throughput per GPU (TFLOP/s/GPU): 74.2 | learning rate: 1.983847E-06 | global batch size:    64 | lm loss: 7.956023E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 04:15:38] iteration      588/    1000 | consumed samples:        37632 | elapsed time per iteration (ms): 96704.1 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.976130E-06 | global batch size:    64 | lm loss: 6.003830E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955e77900] mmco: unref short failure
[h264 @ 0x55d955e77900] mmco: unref short failure
 [2024-11-28 04:16:57] iteration      589/    1000 | consumed samples:        37696 | elapsed time per iteration (ms): 78610.3 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.968419E-06 | global batch size:    64 | lm loss: 6.652268E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e77900] mmco: unref short failure
[h264 @ 0x55d955e77900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 04:18:16] iteration      590/    1000 | consumed samples:        37760 | elapsed time per iteration (ms): 79304.7 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.960714E-06 | global batch size:    64 | lm loss: 6.871901E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 04:19:46] iteration      591/    1000 | consumed samples:        37824 | elapsed time per iteration (ms): 89563.2 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 1.953015E-06 | global batch size:    64 | lm loss: 6.808087E-01 | loss scale: 1.0 | grad norm: 0.825 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
 [2024-11-28 04:21:19] iteration      592/    1000 | consumed samples:        37888 | elapsed time per iteration (ms): 93821.8 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 1.945322E-06 | global batch size:    64 | lm loss: 6.263063E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 04:22:30] iteration      593/    1000 | consumed samples:        37952 | elapsed time per iteration (ms): 70646.2 | throughput per GPU (TFLOP/s/GPU): 109.1 | learning rate: 1.937636E-06 | global batch size:    64 | lm loss: 6.241408E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:23:56] iteration      594/    1000 | consumed samples:        38016 | elapsed time per iteration (ms): 85648.4 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.929956E-06 | global batch size:    64 | lm loss: 7.639532E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 04:25:13] iteration      595/    1000 | consumed samples:        38080 | elapsed time per iteration (ms): 76932.8 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 1.922283E-06 | global batch size:    64 | lm loss: 6.555119E-01 | loss scale: 1.0 | grad norm: 1.175 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 04:26:52] iteration      596/    1000 | consumed samples:        38144 | elapsed time per iteration (ms): 99139.1 | throughput per GPU (TFLOP/s/GPU): 77.8 | learning rate: 1.914616E-06 | global batch size:    64 | lm loss: 6.388863E-01 | loss scale: 1.0 | grad norm: 0.967 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
 [2024-11-28 04:28:04] iteration      597/    1000 | consumed samples:        38208 | elapsed time per iteration (ms): 72491.0 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 1.906956E-06 | global batch size:    64 | lm loss: 6.704025E-01 | loss scale: 1.0 | grad norm: 1.546 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 04:29:25] iteration      598/    1000 | consumed samples:        38272 | elapsed time per iteration (ms): 80478.1 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 1.899303E-06 | global batch size:    64 | lm loss: 6.136689E-01 | loss scale: 1.0 | grad norm: 1.109 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
 [2024-11-28 04:31:01] iteration      599/    1000 | consumed samples:        38336 | elapsed time per iteration (ms): 96696.7 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.891656E-06 | global batch size:    64 | lm loss: 6.841081E-01 | loss scale: 1.0 | grad norm: 0.911 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 04:32:13] iteration      600/    1000 | consumed samples:        38400 | elapsed time per iteration (ms): 71923.7 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 1.884016E-06 | global batch size:    64 | lm loss: 6.775546E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (145404.08, 145404.33)
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
 [2024-11-28 04:35:55] iteration      601/    1000 | consumed samples:        38464 | elapsed time per iteration (ms): 76132.0 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.876384E-06 | global batch size:    64 | lm loss: 6.671367E-01 | loss scale: 1.0 | grad norm: 1.008 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 04:37:59] iteration      602/    1000 | consumed samples:        38528 | elapsed time per iteration (ms): 124385.1 | throughput per GPU (TFLOP/s/GPU): 62.0 | learning rate: 1.868758E-06 | global batch size:    64 | lm loss: 6.553986E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:39:27] iteration      603/    1000 | consumed samples:        38592 | elapsed time per iteration (ms): 87787.7 | throughput per GPU (TFLOP/s/GPU): 87.8 | learning rate: 1.861140E-06 | global batch size:    64 | lm loss: 6.798328E-01 | loss scale: 1.0 | grad norm: 0.921 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 04:41:44] iteration      604/    1000 | consumed samples:        38656 | elapsed time per iteration (ms): 137126.1 | throughput per GPU (TFLOP/s/GPU): 56.2 | learning rate: 1.853528E-06 | global batch size:    64 | lm loss: 6.614301E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:43:04] iteration      605/    1000 | consumed samples:        38720 | elapsed time per iteration (ms): 80051.9 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 1.845925E-06 | global batch size:    64 | lm loss: 6.650500E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-28 04:44:41] iteration      606/    1000 | consumed samples:        38784 | elapsed time per iteration (ms): 96758.0 | throughput per GPU (TFLOP/s/GPU): 79.7 | learning rate: 1.838328E-06 | global batch size:    64 | lm loss: 6.374074E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 04:45:56] iteration      607/    1000 | consumed samples:        38848 | elapsed time per iteration (ms): 74822.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 1.830739E-06 | global batch size:    64 | lm loss: 6.324731E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 04:47:20] iteration      608/    1000 | consumed samples:        38912 | elapsed time per iteration (ms): 83983.6 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.823157E-06 | global batch size:    64 | lm loss: 7.577994E-01 | loss scale: 1.0 | grad norm: 0.919 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 04:48:36] iteration      609/    1000 | consumed samples:        38976 | elapsed time per iteration (ms): 75702.5 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 1.815584E-06 | global batch size:    64 | lm loss: 7.271475E-01 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 04:49:48] iteration      610/    1000 | consumed samples:        39040 | elapsed time per iteration (ms): 71831.5 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 1.808017E-06 | global batch size:    64 | lm loss: 7.437881E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] Missing reference picture, default is 65540
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] Missing reference picture, default is 65540
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] Missing reference picture, default is 65540
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] Missing reference picture, default is 65540
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-28 04:51:43] iteration      611/    1000 | consumed samples:        39104 | elapsed time per iteration (ms): 115234.0 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 1.800459E-06 | global batch size:    64 | lm loss: 6.608888E-01 | loss scale: 1.0 | grad norm: 0.997 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:52:55] iteration      612/    1000 | consumed samples:        39168 | elapsed time per iteration (ms): 72263.2 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 1.792908E-06 | global batch size:    64 | lm loss: 6.323757E-01 | loss scale: 1.0 | grad norm: 0.876 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 04:54:19] iteration      613/    1000 | consumed samples:        39232 | elapsed time per iteration (ms): 84413.8 | throughput per GPU (TFLOP/s/GPU): 91.3 | learning rate: 1.785366E-06 | global batch size:    64 | lm loss: 6.873047E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
 [2024-11-28 04:56:25] iteration      614/    1000 | consumed samples:        39296 | elapsed time per iteration (ms): 125544.5 | throughput per GPU (TFLOP/s/GPU): 61.4 | learning rate: 1.777831E-06 | global batch size:    64 | lm loss: 6.434957E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 04:57:56] iteration      615/    1000 | consumed samples:        39360 | elapsed time per iteration (ms): 91010.5 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 1.770305E-06 | global batch size:    64 | lm loss: 6.286437E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 04:59:23] iteration      616/    1000 | consumed samples:        39424 | elapsed time per iteration (ms): 87353.2 | throughput per GPU (TFLOP/s/GPU): 88.2 | learning rate: 1.762786E-06 | global batch size:    64 | lm loss: 6.583841E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:00:51] iteration      617/    1000 | consumed samples:        39488 | elapsed time per iteration (ms): 87235.7 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 1.755276E-06 | global batch size:    64 | lm loss: 6.181954E-01 | loss scale: 1.0 | grad norm: 0.806 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-28 05:02:07] iteration      618/    1000 | consumed samples:        39552 | elapsed time per iteration (ms): 75907.6 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 1.747775E-06 | global batch size:    64 | lm loss: 6.557152E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
 [2024-11-28 05:03:45] iteration      619/    1000 | consumed samples:        39616 | elapsed time per iteration (ms): 98071.4 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.740281E-06 | global batch size:    64 | lm loss: 7.434057E-01 | loss scale: 1.0 | grad norm: 1.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 05:05:04] iteration      620/    1000 | consumed samples:        39680 | elapsed time per iteration (ms): 78880.8 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 1.732797E-06 | global batch size:    64 | lm loss: 6.678324E-01 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:06:53] iteration      621/    1000 | consumed samples:        39744 | elapsed time per iteration (ms): 109711.4 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 1.725320E-06 | global batch size:    64 | lm loss: 6.870947E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:08:28] iteration      622/    1000 | consumed samples:        39808 | elapsed time per iteration (ms): 94990.9 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.717853E-06 | global batch size:    64 | lm loss: 6.423949E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 05:10:16] iteration      623/    1000 | consumed samples:        39872 | elapsed time per iteration (ms): 107279.7 | throughput per GPU (TFLOP/s/GPU): 71.9 | learning rate: 1.710394E-06 | global batch size:    64 | lm loss: 6.864921E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:11:34] iteration      624/    1000 | consumed samples:        39936 | elapsed time per iteration (ms): 78100.2 | throughput per GPU (TFLOP/s/GPU): 98.7 | learning rate: 1.702944E-06 | global batch size:    64 | lm loss: 6.161958E-01 | loss scale: 1.0 | grad norm: 0.932 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:12:52] iteration      625/    1000 | consumed samples:        40000 | elapsed time per iteration (ms): 78301.6 | throughput per GPU (TFLOP/s/GPU): 98.4 | learning rate: 1.695503E-06 | global batch size:    64 | lm loss: 6.460192E-01 | loss scale: 1.0 | grad norm: 2.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 05:14:10] iteration      626/    1000 | consumed samples:        40064 | elapsed time per iteration (ms): 78201.6 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 1.688070E-06 | global batch size:    64 | lm loss: 6.443557E-01 | loss scale: 1.0 | grad norm: 0.823 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 05:15:44] iteration      627/    1000 | consumed samples:        40128 | elapsed time per iteration (ms): 93670.8 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 1.680647E-06 | global batch size:    64 | lm loss: 6.387815E-01 | loss scale: 1.0 | grad norm: 0.940 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x555dee948bc0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 05:17:20] iteration      628/    1000 | consumed samples:        40192 | elapsed time per iteration (ms): 96292.0 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 1.673233E-06 | global batch size:    64 | lm loss: 6.631010E-01 | loss scale: 1.0 | grad norm: 0.909 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 05:18:48] iteration      629/    1000 | consumed samples:        40256 | elapsed time per iteration (ms): 88126.4 | throughput per GPU (TFLOP/s/GPU): 87.5 | learning rate: 1.665828E-06 | global batch size:    64 | lm loss: 6.977863E-01 | loss scale: 1.0 | grad norm: 1.025 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 05:20:05] iteration      630/    1000 | consumed samples:        40320 | elapsed time per iteration (ms): 76528.1 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.658433E-06 | global batch size:    64 | lm loss: 6.857378E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 05:21:10] iteration      631/    1000 | consumed samples:        40384 | elapsed time per iteration (ms): 65241.8 | throughput per GPU (TFLOP/s/GPU): 118.2 | learning rate: 1.651047E-06 | global batch size:    64 | lm loss: 6.398815E-01 | loss scale: 1.0 | grad norm: 1.143 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 05:22:28] iteration      632/    1000 | consumed samples:        40448 | elapsed time per iteration (ms): 77900.5 | throughput per GPU (TFLOP/s/GPU): 99.0 | learning rate: 1.643670E-06 | global batch size:    64 | lm loss: 6.859981E-01 | loss scale: 1.0 | grad norm: 0.871 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 05:23:53] iteration      633/    1000 | consumed samples:        40512 | elapsed time per iteration (ms): 84697.6 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 1.636303E-06 | global batch size:    64 | lm loss: 8.127314E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
 [2024-11-28 05:25:42] iteration      634/    1000 | consumed samples:        40576 | elapsed time per iteration (ms): 109558.1 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 1.628945E-06 | global batch size:    64 | lm loss: 7.018932E-01 | loss scale: 1.0 | grad norm: 1.332 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-28 05:27:01] iteration      635/    1000 | consumed samples:        40640 | elapsed time per iteration (ms): 78317.2 | throughput per GPU (TFLOP/s/GPU): 98.4 | learning rate: 1.621597E-06 | global batch size:    64 | lm loss: 6.609097E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 05:28:21] iteration      636/    1000 | consumed samples:        40704 | elapsed time per iteration (ms): 79989.3 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 1.614259E-06 | global batch size:    64 | lm loss: 6.400303E-01 | loss scale: 1.0 | grad norm: 0.997 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 05:29:30] iteration      637/    1000 | consumed samples:        40768 | elapsed time per iteration (ms): 69397.2 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 1.606930E-06 | global batch size:    64 | lm loss: 6.713878E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 05:31:01] iteration      638/    1000 | consumed samples:        40832 | elapsed time per iteration (ms): 90607.7 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 1.599612E-06 | global batch size:    64 | lm loss: 6.384090E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956b52f40] mmco: unref short failure
[h264 @ 0x55d956b52f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 05:32:23] iteration      639/    1000 | consumed samples:        40896 | elapsed time per iteration (ms): 82721.4 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 1.592303E-06 | global batch size:    64 | lm loss: 6.337037E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 05:33:53] iteration      640/    1000 | consumed samples:        40960 | elapsed time per iteration (ms): 89577.6 | throughput per GPU (TFLOP/s/GPU): 86.1 | learning rate: 1.585004E-06 | global batch size:    64 | lm loss: 6.542257E-01 | loss scale: 1.0 | grad norm: 0.906 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 05:35:43] iteration      641/    1000 | consumed samples:        41024 | elapsed time per iteration (ms): 110295.5 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.577716E-06 | global batch size:    64 | lm loss: 6.256745E-01 | loss scale: 1.0 | grad norm: 0.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 05:37:04] iteration      642/    1000 | consumed samples:        41088 | elapsed time per iteration (ms): 80229.3 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 1.570438E-06 | global batch size:    64 | lm loss: 7.508308E-01 | loss scale: 1.0 | grad norm: 1.025 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
 [2024-11-28 05:38:14] iteration      643/    1000 | consumed samples:        41152 | elapsed time per iteration (ms): 70229.7 | throughput per GPU (TFLOP/s/GPU): 109.8 | learning rate: 1.563170E-06 | global batch size:    64 | lm loss: 6.440120E-01 | loss scale: 1.0 | grad norm: 1.269 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-28 05:39:52] iteration      644/    1000 | consumed samples:        41216 | elapsed time per iteration (ms): 98058.6 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 1.555912E-06 | global batch size:    64 | lm loss: 7.841610E-01 | loss scale: 1.0 | grad norm: 1.065 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
 [2024-11-28 05:41:15] iteration      645/    1000 | consumed samples:        41280 | elapsed time per iteration (ms): 82685.7 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 1.548665E-06 | global batch size:    64 | lm loss: 6.619388E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 05:42:54] iteration      646/    1000 | consumed samples:        41344 | elapsed time per iteration (ms): 99243.9 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 1.541428E-06 | global batch size:    64 | lm loss: 6.424022E-01 | loss scale: 1.0 | grad norm: 1.142 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 05:44:18] iteration      647/    1000 | consumed samples:        41408 | elapsed time per iteration (ms): 84013.8 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.534202E-06 | global batch size:    64 | lm loss: 6.809196E-01 | loss scale: 1.0 | grad norm: 0.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 05:45:31] iteration      648/    1000 | consumed samples:        41472 | elapsed time per iteration (ms): 73201.4 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 1.526987E-06 | global batch size:    64 | lm loss: 6.052883E-01 | loss scale: 1.0 | grad norm: 1.352 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 05:48:26] iteration      649/    1000 | consumed samples:        41536 | elapsed time per iteration (ms): 174738.1 | throughput per GPU (TFLOP/s/GPU): 44.1 | learning rate: 1.519782E-06 | global batch size:    64 | lm loss: 6.482288E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 05:49:37] iteration      650/    1000 | consumed samples:        41600 | elapsed time per iteration (ms): 71547.4 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 1.512588E-06 | global batch size:    64 | lm loss: 7.025335E-01 | loss scale: 1.0 | grad norm: 0.772 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 05:51:05] iteration      651/    1000 | consumed samples:        41664 | elapsed time per iteration (ms): 87819.5 | throughput per GPU (TFLOP/s/GPU): 87.8 | learning rate: 1.505405E-06 | global batch size:    64 | lm loss: 6.501545E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 05:52:25] iteration      652/    1000 | consumed samples:        41728 | elapsed time per iteration (ms): 80258.0 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 1.498233E-06 | global batch size:    64 | lm loss: 6.332477E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 05:53:44] iteration      653/    1000 | consumed samples:        41792 | elapsed time per iteration (ms): 78288.0 | throughput per GPU (TFLOP/s/GPU): 98.5 | learning rate: 1.491072E-06 | global batch size:    64 | lm loss: 6.316616E-01 | loss scale: 1.0 | grad norm: 1.037 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 05:54:57] iteration      654/    1000 | consumed samples:        41856 | elapsed time per iteration (ms): 73018.5 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 1.483922E-06 | global batch size:    64 | lm loss: 6.351939E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 05:56:27] iteration      655/    1000 | consumed samples:        41920 | elapsed time per iteration (ms): 90523.6 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 1.476783E-06 | global batch size:    64 | lm loss: 7.357122E-01 | loss scale: 1.0 | grad norm: 0.958 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 05:58:09] iteration      656/    1000 | consumed samples:        41984 | elapsed time per iteration (ms): 101719.0 | throughput per GPU (TFLOP/s/GPU): 75.8 | learning rate: 1.469656E-06 | global batch size:    64 | lm loss: 6.373831E-01 | loss scale: 1.0 | grad norm: 0.758 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9566796c0] mmco: unref short failure
[h264 @ 0x55d9566796c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 05:59:31] iteration      657/    1000 | consumed samples:        42048 | elapsed time per iteration (ms): 81789.4 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 1.462540E-06 | global batch size:    64 | lm loss: 7.237002E-01 | loss scale: 1.0 | grad norm: 1.506 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 06:00:56] iteration      658/    1000 | consumed samples:        42112 | elapsed time per iteration (ms): 85319.2 | throughput per GPU (TFLOP/s/GPU): 90.3 | learning rate: 1.455435E-06 | global batch size:    64 | lm loss: 8.265692E-01 | loss scale: 1.0 | grad norm: 0.875 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-28 06:02:07] iteration      659/    1000 | consumed samples:        42176 | elapsed time per iteration (ms): 70835.9 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 1.448341E-06 | global batch size:    64 | lm loss: 6.352268E-01 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 06:03:38] iteration      660/    1000 | consumed samples:        42240 | elapsed time per iteration (ms): 90985.5 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 1.441260E-06 | global batch size:    64 | lm loss: 6.936147E-01 | loss scale: 1.0 | grad norm: 1.083 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957f97780] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
 [2024-11-28 06:05:10] iteration      661/    1000 | consumed samples:        42304 | elapsed time per iteration (ms): 91563.5 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 1.434190E-06 | global batch size:    64 | lm loss: 6.521118E-01 | loss scale: 1.0 | grad norm: 0.942 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 06:06:19] iteration      662/    1000 | consumed samples:        42368 | elapsed time per iteration (ms): 69403.8 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 1.427131E-06 | global batch size:    64 | lm loss: 6.527104E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d957f98640] mmco: unref short failure
[h264 @ 0x55d957f98640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 06:07:55] iteration      663/    1000 | consumed samples:        42432 | elapsed time per iteration (ms): 95706.3 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 1.420085E-06 | global batch size:    64 | lm loss: 6.427971E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deebbd8c0] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 06:10:29] iteration      664/    1000 | consumed samples:        42496 | elapsed time per iteration (ms): 154602.0 | throughput per GPU (TFLOP/s/GPU): 49.9 | learning rate: 1.413050E-06 | global batch size:    64 | lm loss: 6.893479E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 06:12:11] iteration      665/    1000 | consumed samples:        42560 | elapsed time per iteration (ms): 101420.4 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 1.406027E-06 | global batch size:    64 | lm loss: 7.379445E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-28 06:13:32] iteration      666/    1000 | consumed samples:        42624 | elapsed time per iteration (ms): 81326.1 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 1.399016E-06 | global batch size:    64 | lm loss: 6.550295E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
 [2024-11-28 06:15:36] iteration      667/    1000 | consumed samples:        42688 | elapsed time per iteration (ms): 123552.3 | throughput per GPU (TFLOP/s/GPU): 62.4 | learning rate: 1.392018E-06 | global batch size:    64 | lm loss: 7.040758E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-28 06:17:52] iteration      668/    1000 | consumed samples:        42752 | elapsed time per iteration (ms): 136390.8 | throughput per GPU (TFLOP/s/GPU): 56.5 | learning rate: 1.385031E-06 | global batch size:    64 | lm loss: 6.094788E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-28 06:19:23] iteration      669/    1000 | consumed samples:        42816 | elapsed time per iteration (ms): 90558.5 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 1.378057E-06 | global batch size:    64 | lm loss: 6.080279E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 06:20:37] iteration      670/    1000 | consumed samples:        42880 | elapsed time per iteration (ms): 74584.3 | throughput per GPU (TFLOP/s/GPU): 103.4 | learning rate: 1.371094E-06 | global batch size:    64 | lm loss: 6.635122E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-28 06:22:11] iteration      671/    1000 | consumed samples:        42944 | elapsed time per iteration (ms): 93345.0 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 1.364145E-06 | global batch size:    64 | lm loss: 6.329452E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
 [2024-11-28 06:23:45] iteration      672/    1000 | consumed samples:        43008 | elapsed time per iteration (ms): 93954.5 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 1.357207E-06 | global batch size:    64 | lm loss: 6.652647E-01 | loss scale: 1.0 | grad norm: 1.427 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530
[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] Missing reference picture, default is 65530
[h264 @ 0x55d95873d100] Missing reference picture, default is 65530
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] Missing reference picture, default is 65530
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x555dec6aab00] mmco: unref short failure
[h264 @ 0x55d95873d100] Missing reference picture, default is 65530
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] Missing reference picture, default is 65530
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-28 06:25:12] iteration      673/    1000 | consumed samples:        43072 | elapsed time per iteration (ms): 87676.7 | throughput per GPU (TFLOP/s/GPU): 87.9 | learning rate: 1.350283E-06 | global batch size:    64 | lm loss: 6.648919E-01 | loss scale: 1.0 | grad norm: 0.909 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x555ded1fff00] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 06:28:07] iteration      674/    1000 | consumed samples:        43136 | elapsed time per iteration (ms): 175164.1 | throughput per GPU (TFLOP/s/GPU): 44.0 | learning rate: 1.343370E-06 | global batch size:    64 | lm loss: 6.360693E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 06:29:28] iteration      675/    1000 | consumed samples:        43200 | elapsed time per iteration (ms): 80818.5 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.336471E-06 | global batch size:    64 | lm loss: 6.220077E-01 | loss scale: 1.0 | grad norm: 0.859 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 06:31:06] iteration      676/    1000 | consumed samples:        43264 | elapsed time per iteration (ms): 97698.5 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.329584E-06 | global batch size:    64 | lm loss: 7.096108E-01 | loss scale: 1.0 | grad norm: 1.075 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 06:32:31] iteration      677/    1000 | consumed samples:        43328 | elapsed time per iteration (ms): 84755.2 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 1.322710E-06 | global batch size:    64 | lm loss: 6.861854E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 06:33:44] iteration      678/    1000 | consumed samples:        43392 | elapsed time per iteration (ms): 73412.7 | throughput per GPU (TFLOP/s/GPU): 105.0 | learning rate: 1.315849E-06 | global batch size:    64 | lm loss: 6.372385E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 06:35:14] iteration      679/    1000 | consumed samples:        43456 | elapsed time per iteration (ms): 90165.8 | throughput per GPU (TFLOP/s/GPU): 85.5 | learning rate: 1.309001E-06 | global batch size:    64 | lm loss: 7.036251E-01 | loss scale: 1.0 | grad norm: 1.326 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 06:36:53] iteration      680/    1000 | consumed samples:        43520 | elapsed time per iteration (ms): 98651.2 | throughput per GPU (TFLOP/s/GPU): 78.1 | learning rate: 1.302166E-06 | global batch size:    64 | lm loss: 6.906869E-01 | loss scale: 1.0 | grad norm: 0.888 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 06:38:45] iteration      681/    1000 | consumed samples:        43584 | elapsed time per iteration (ms): 112237.4 | throughput per GPU (TFLOP/s/GPU): 68.7 | learning rate: 1.295344E-06 | global batch size:    64 | lm loss: 6.447147E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 06:40:02] iteration      682/    1000 | consumed samples:        43648 | elapsed time per iteration (ms): 76368.4 | throughput per GPU (TFLOP/s/GPU): 100.9 | learning rate: 1.288535E-06 | global batch size:    64 | lm loss: 6.220345E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 06:41:12] iteration      683/    1000 | consumed samples:        43712 | elapsed time per iteration (ms): 69860.0 | throughput per GPU (TFLOP/s/GPU): 110.3 | learning rate: 1.281739E-06 | global batch size:    64 | lm loss: 6.551774E-01 | loss scale: 1.0 | grad norm: 0.941 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
 [2024-11-28 06:42:27] iteration      684/    1000 | consumed samples:        43776 | elapsed time per iteration (ms): 75777.7 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.274957E-06 | global batch size:    64 | lm loss: 6.513221E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 06:43:43] iteration      685/    1000 | consumed samples:        43840 | elapsed time per iteration (ms): 76097.4 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.268188E-06 | global batch size:    64 | lm loss: 6.644771E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 06:45:07] iteration      686/    1000 | consumed samples:        43904 | elapsed time per iteration (ms): 83094.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 1.261432E-06 | global batch size:    64 | lm loss: 6.695704E-01 | loss scale: 1.0 | grad norm: 0.789 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
 [2024-11-28 06:46:36] iteration      687/    1000 | consumed samples:        43968 | elapsed time per iteration (ms): 89651.0 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 1.254690E-06 | global batch size:    64 | lm loss: 6.823508E-01 | loss scale: 1.0 | grad norm: 1.004 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 06:48:00] iteration      688/    1000 | consumed samples:        44032 | elapsed time per iteration (ms): 83603.2 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.247961E-06 | global batch size:    64 | lm loss: 6.911366E-01 | loss scale: 1.0 | grad norm: 1.534 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 06:49:24] iteration      689/    1000 | consumed samples:        44096 | elapsed time per iteration (ms): 84296.1 | throughput per GPU (TFLOP/s/GPU): 91.4 | learning rate: 1.241247E-06 | global batch size:    64 | lm loss: 6.808408E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 06:50:49] iteration      690/    1000 | consumed samples:        44160 | elapsed time per iteration (ms): 85310.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.234546E-06 | global batch size:    64 | lm loss: 6.851189E-01 | loss scale: 1.0 | grad norm: 0.967 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded624e00] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-28 06:52:15] iteration      691/    1000 | consumed samples:        44224 | elapsed time per iteration (ms): 85594.1 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 1.227858E-06 | global batch size:    64 | lm loss: 6.419601E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 06:53:57] iteration      692/    1000 | consumed samples:        44288 | elapsed time per iteration (ms): 102236.7 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 1.221185E-06 | global batch size:    64 | lm loss: 6.681433E-01 | loss scale: 1.0 | grad norm: 0.821 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-28 06:55:54] iteration      693/    1000 | consumed samples:        44352 | elapsed time per iteration (ms): 116496.4 | throughput per GPU (TFLOP/s/GPU): 66.2 | learning rate: 1.214525E-06 | global batch size:    64 | lm loss: 6.374875E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 06:57:15] iteration      694/    1000 | consumed samples:        44416 | elapsed time per iteration (ms): 81088.3 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 1.207880E-06 | global batch size:    64 | lm loss: 6.311159E-01 | loss scale: 1.0 | grad norm: 1.018 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 06:58:26] iteration      695/    1000 | consumed samples:        44480 | elapsed time per iteration (ms): 71111.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.201249E-06 | global batch size:    64 | lm loss: 6.946983E-01 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
processed_samples 2500 unjoint_samples 2500 joint_samples 168 [120290, 108487]
processed_samples 2500 unjoint_samples 2500 joint_samples 173 [88757, 63354]
processed_samples 2500 unjoint_samples 2500 joint_samples 169 [110246, 71865]
processed_samples 2500 unjoint_samples 2500 joint_samples 180 [102941, 84465]
processed_samples 2500 unjoint_samples 2500 joint_samples 174 [125182, 14057]
processed_samples 2500 unjoint_samples 2500 joint_samples 171 [96609, 73106]
processed_samples 2500 unjoint_samples 2500 joint_samples 178 [97503, 125668]
processed_samples 2500 unjoint_samples 2500 joint_samples 168 [120290, 108487]
processed_samples 2500 unjoint_samples 2500 joint_samples 173 [88757, 63354]
processed_samples 2500 unjoint_samples 2500 joint_samples 169 [110246, 71865]
processed_samples 2500 unjoint_samples 2500 joint_samples 180 [102941, 84465]
processed_samples 2500 unjoint_samples 2500 joint_samples 174 [125182, 14057]
processed_samples 2500 unjoint_samples 2500 joint_samples 171 [96609, 73106]
processed_samples 2500 unjoint_samples 2500 joint_samples 178 [97503, 125668]
processed_samples 2500 unjoint_samples 2500 joint_samples 179 [119274, 75578]
processed_samples 2500 unjoint_samples 2500 joint_samples 179 [119274, 75578]
 [2024-11-28 06:59:58] iteration      696/    1000 | consumed samples:        44544 | elapsed time per iteration (ms): 92412.2 | throughput per GPU (TFLOP/s/GPU): 83.4 | learning rate: 1.194631E-06 | global batch size:    64 | lm loss: 6.888507E-01 | loss scale: 1.0 | grad norm: 1.813 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 07:01:23] iteration      697/    1000 | consumed samples:        44608 | elapsed time per iteration (ms): 84033.4 | throughput per GPU (TFLOP/s/GPU): 91.7 | learning rate: 1.188028E-06 | global batch size:    64 | lm loss: 6.656145E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 07:02:45] iteration      698/    1000 | consumed samples:        44672 | elapsed time per iteration (ms): 82325.8 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 1.181440E-06 | global batch size:    64 | lm loss: 6.377969E-01 | loss scale: 1.0 | grad norm: 0.987 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 07:04:21] iteration      699/    1000 | consumed samples:        44736 | elapsed time per iteration (ms): 96343.7 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 1.174865E-06 | global batch size:    64 | lm loss: 6.097525E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-28 07:05:35] iteration      700/    1000 | consumed samples:        44800 | elapsed time per iteration (ms): 73619.4 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 1.168305E-06 | global batch size:    64 | lm loss: 5.898384E-01 | loss scale: 1.0 | grad norm: 0.878 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (266265.05, 266265.44)
 [2024-11-28 07:11:23] iteration      701/    1000 | consumed samples:        44864 | elapsed time per iteration (ms): 82012.4 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 1.161760E-06 | global batch size:    64 | lm loss: 7.252907E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 07:12:57] iteration      702/    1000 | consumed samples:        44928 | elapsed time per iteration (ms): 93619.3 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 1.155229E-06 | global batch size:    64 | lm loss: 6.619033E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 07:15:00] iteration      703/    1000 | consumed samples:        44992 | elapsed time per iteration (ms): 122806.3 | throughput per GPU (TFLOP/s/GPU): 62.8 | learning rate: 1.148713E-06 | global batch size:    64 | lm loss: 6.486115E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-28 07:16:23] iteration      704/    1000 | consumed samples:        45056 | elapsed time per iteration (ms): 83280.2 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 1.142211E-06 | global batch size:    64 | lm loss: 6.365772E-01 | loss scale: 1.0 | grad norm: 1.116 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df1590d80] mmco: unref short failure
[h264 @ 0x555df1590d80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 07:18:33] iteration      705/    1000 | consumed samples:        45120 | elapsed time per iteration (ms): 130359.9 | throughput per GPU (TFLOP/s/GPU): 59.1 | learning rate: 1.135724E-06 | global batch size:    64 | lm loss: 6.714696E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 07:20:11] iteration      706/    1000 | consumed samples:        45184 | elapsed time per iteration (ms): 97891.1 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 1.129252E-06 | global batch size:    64 | lm loss: 7.067757E-01 | loss scale: 1.0 | grad norm: 0.749 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
 [2024-11-28 07:21:25] iteration      707/    1000 | consumed samples:        45248 | elapsed time per iteration (ms): 73520.2 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 1.122795E-06 | global batch size:    64 | lm loss: 6.495814E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d959a05200] mmco: unref short failure
 [2024-11-28 07:22:54] iteration      708/    1000 | consumed samples:        45312 | elapsed time per iteration (ms): 88974.9 | throughput per GPU (TFLOP/s/GPU): 86.6 | learning rate: 1.116353E-06 | global batch size:    64 | lm loss: 6.196807E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 07:24:42] iteration      709/    1000 | consumed samples:        45376 | elapsed time per iteration (ms): 108553.4 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 1.109926E-06 | global batch size:    64 | lm loss: 6.966307E-01 | loss scale: 1.0 | grad norm: 1.342 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-28 07:26:25] iteration      710/    1000 | consumed samples:        45440 | elapsed time per iteration (ms): 102631.6 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 1.103514E-06 | global batch size:    64 | lm loss: 6.217573E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 07:27:56] iteration      711/    1000 | consumed samples:        45504 | elapsed time per iteration (ms): 91479.5 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.097117E-06 | global batch size:    64 | lm loss: 6.903121E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 07:29:15] iteration      712/    1000 | consumed samples:        45568 | elapsed time per iteration (ms): 78703.9 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 1.090736E-06 | global batch size:    64 | lm loss: 7.672721E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 07:30:49] iteration      713/    1000 | consumed samples:        45632 | elapsed time per iteration (ms): 94197.3 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.084370E-06 | global batch size:    64 | lm loss: 6.627736E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 07:32:21] iteration      714/    1000 | consumed samples:        45696 | elapsed time per iteration (ms): 91448.9 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.078019E-06 | global batch size:    64 | lm loss: 6.277137E-01 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 07:35:48] iteration      715/    1000 | consumed samples:        45760 | elapsed time per iteration (ms): 207227.5 | throughput per GPU (TFLOP/s/GPU): 37.2 | learning rate: 1.071683E-06 | global batch size:    64 | lm loss: 6.213148E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 07:37:11] iteration      716/    1000 | consumed samples:        45824 | elapsed time per iteration (ms): 82861.5 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 1.065363E-06 | global batch size:    64 | lm loss: 6.508120E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 07:38:56] iteration      717/    1000 | consumed samples:        45888 | elapsed time per iteration (ms): 104658.6 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 1.059059E-06 | global batch size:    64 | lm loss: 6.287351E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555ded63c840] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded0cd540] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
 [2024-11-28 07:40:46] iteration      718/    1000 | consumed samples:        45952 | elapsed time per iteration (ms): 110547.3 | throughput per GPU (TFLOP/s/GPU): 69.7 | learning rate: 1.052770E-06 | global batch size:    64 | lm loss: 6.484494E-01 | loss scale: 1.0 | grad norm: 1.095 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-28 07:41:57] iteration      719/    1000 | consumed samples:        46016 | elapsed time per iteration (ms): 71255.5 | throughput per GPU (TFLOP/s/GPU): 108.2 | learning rate: 1.046497E-06 | global batch size:    64 | lm loss: 6.334097E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 07:43:07] iteration      720/    1000 | consumed samples:        46080 | elapsed time per iteration (ms): 69978.1 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 1.040240E-06 | global batch size:    64 | lm loss: 6.164656E-01 | loss scale: 1.0 | grad norm: 1.219 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 07:44:15] iteration      721/    1000 | consumed samples:        46144 | elapsed time per iteration (ms): 68071.8 | throughput per GPU (TFLOP/s/GPU): 113.2 | learning rate: 1.033999E-06 | global batch size:    64 | lm loss: 6.665082E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
 [2024-11-28 07:45:53] iteration      722/    1000 | consumed samples:        46208 | elapsed time per iteration (ms): 97636.9 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 1.027773E-06 | global batch size:    64 | lm loss: 7.176014E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 07:47:09] iteration      723/    1000 | consumed samples:        46272 | elapsed time per iteration (ms): 75952.6 | throughput per GPU (TFLOP/s/GPU): 101.5 | learning rate: 1.021564E-06 | global batch size:    64 | lm loss: 7.123392E-01 | loss scale: 1.0 | grad norm: 1.054 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec794000] mmco: unref short failure
[h264 @ 0x555dec794000] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95692cb80] mmco: unref short failure
[h264 @ 0x55d95692cb80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 07:48:34] iteration      724/    1000 | consumed samples:        46336 | elapsed time per iteration (ms): 85202.5 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.015370E-06 | global batch size:    64 | lm loss: 6.468613E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 07:50:06] iteration      725/    1000 | consumed samples:        46400 | elapsed time per iteration (ms): 91867.5 | throughput per GPU (TFLOP/s/GPU): 83.9 | learning rate: 1.009193E-06 | global batch size:    64 | lm loss: 6.567154E-01 | loss scale: 1.0 | grad norm: 0.925 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 07:51:49] iteration      726/    1000 | consumed samples:        46464 | elapsed time per iteration (ms): 103112.7 | throughput per GPU (TFLOP/s/GPU): 74.8 | learning rate: 1.003032E-06 | global batch size:    64 | lm loss: 6.286631E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 07:53:26] iteration      727/    1000 | consumed samples:        46528 | elapsed time per iteration (ms): 96307.4 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 9.968868E-07 | global batch size:    64 | lm loss: 5.987281E-01 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 07:54:48] iteration      728/    1000 | consumed samples:        46592 | elapsed time per iteration (ms): 81878.8 | throughput per GPU (TFLOP/s/GPU): 94.1 | learning rate: 9.907581E-07 | global batch size:    64 | lm loss: 6.290482E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 07:56:38] iteration      729/    1000 | consumed samples:        46656 | elapsed time per iteration (ms): 109994.2 | throughput per GPU (TFLOP/s/GPU): 70.1 | learning rate: 9.846458E-07 | global batch size:    64 | lm loss: 5.959722E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
 [2024-11-28 07:58:01] iteration      730/    1000 | consumed samples:        46720 | elapsed time per iteration (ms): 83517.2 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 9.785499E-07 | global batch size:    64 | lm loss: 6.537108E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 07:59:13] iteration      731/    1000 | consumed samples:        46784 | elapsed time per iteration (ms): 71873.9 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 9.724704E-07 | global batch size:    64 | lm loss: 6.193046E-01 | loss scale: 1.0 | grad norm: 0.769 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 08:00:44] iteration      732/    1000 | consumed samples:        46848 | elapsed time per iteration (ms): 91514.7 | throughput per GPU (TFLOP/s/GPU): 84.2 | learning rate: 9.664075E-07 | global batch size:    64 | lm loss: 6.407279E-01 | loss scale: 1.0 | grad norm: 1.185 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:02:07] iteration      733/    1000 | consumed samples:        46912 | elapsed time per iteration (ms): 82064.7 | throughput per GPU (TFLOP/s/GPU): 93.9 | learning rate: 9.603612E-07 | global batch size:    64 | lm loss: 6.269274E-01 | loss scale: 1.0 | grad norm: 0.800 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 08:03:40] iteration      734/    1000 | consumed samples:        46976 | elapsed time per iteration (ms): 93712.0 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 9.543316E-07 | global batch size:    64 | lm loss: 5.805084E-01 | loss scale: 1.0 | grad norm: 0.878 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 08:05:36] iteration      735/    1000 | consumed samples:        47040 | elapsed time per iteration (ms): 116175.3 | throughput per GPU (TFLOP/s/GPU): 66.4 | learning rate: 9.483188E-07 | global batch size:    64 | lm loss: 6.738163E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555ded0fd000] mmco: unref short failure
 [2024-11-28 08:06:55] iteration      736/    1000 | consumed samples:        47104 | elapsed time per iteration (ms): 78504.1 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 9.423227E-07 | global batch size:    64 | lm loss: 6.742063E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 08:08:16] iteration      737/    1000 | consumed samples:        47168 | elapsed time per iteration (ms): 80642.9 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 9.363435E-07 | global batch size:    64 | lm loss: 6.592349E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 08:09:46] iteration      738/    1000 | consumed samples:        47232 | elapsed time per iteration (ms): 90072.0 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 9.303812E-07 | global batch size:    64 | lm loss: 6.529440E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 08:11:16] iteration      739/    1000 | consumed samples:        47296 | elapsed time per iteration (ms): 89972.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 9.244359E-07 | global batch size:    64 | lm loss: 6.460364E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 08:12:50] iteration      740/    1000 | consumed samples:        47360 | elapsed time per iteration (ms): 94367.3 | throughput per GPU (TFLOP/s/GPU): 81.7 | learning rate: 9.185077E-07 | global batch size:    64 | lm loss: 7.104200E-01 | loss scale: 1.0 | grad norm: 3.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-28 08:14:11] iteration      741/    1000 | consumed samples:        47424 | elapsed time per iteration (ms): 81259.1 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 9.125966E-07 | global batch size:    64 | lm loss: 7.529110E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:15:45] iteration      742/    1000 | consumed samples:        47488 | elapsed time per iteration (ms): 93219.8 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 9.067026E-07 | global batch size:    64 | lm loss: 6.652069E-01 | loss scale: 1.0 | grad norm: 1.478 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:17:38] iteration      743/    1000 | consumed samples:        47552 | elapsed time per iteration (ms): 113535.7 | throughput per GPU (TFLOP/s/GPU): 67.9 | learning rate: 9.008259E-07 | global batch size:    64 | lm loss: 6.635025E-01 | loss scale: 1.0 | grad norm: 1.056 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
 [2024-11-28 08:19:14] iteration      744/    1000 | consumed samples:        47616 | elapsed time per iteration (ms): 95812.0 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 8.949665E-07 | global batch size:    64 | lm loss: 6.736272E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x55d9575fd4c0] mmco: unref short failure
[h264 @ 0x555dedf05880] Missing reference picture, default is 65530
[h264 @ 0x555dedf05880] Missing reference picture, default is 65530
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] Missing reference picture, default is 65530
[h264 @ 0x555dedf05880] Missing reference picture, default is 65530
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530
[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530
[h264 @ 0x55d959cd93c0] Missing reference picture, default is 65530
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 08:20:29] iteration      745/    1000 | consumed samples:        47680 | elapsed time per iteration (ms): 75295.9 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 8.891245E-07 | global batch size:    64 | lm loss: 6.412493E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 08:21:53] iteration      746/    1000 | consumed samples:        47744 | elapsed time per iteration (ms): 83674.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 8.832998E-07 | global batch size:    64 | lm loss: 6.288611E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 08:23:13] iteration      747/    1000 | consumed samples:        47808 | elapsed time per iteration (ms): 79617.4 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 8.774927E-07 | global batch size:    64 | lm loss: 6.212032E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x55d95c2eb200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
 [2024-11-28 08:24:28] iteration      748/    1000 | consumed samples:        47872 | elapsed time per iteration (ms): 75213.3 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 8.717031E-07 | global batch size:    64 | lm loss: 6.660864E-01 | loss scale: 1.0 | grad norm: 0.858 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d957a91640] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
 [2024-11-28 08:26:48] iteration      749/    1000 | consumed samples:        47936 | elapsed time per iteration (ms): 140548.6 | throughput per GPU (TFLOP/s/GPU): 54.8 | learning rate: 8.659311E-07 | global batch size:    64 | lm loss: 7.050755E-01 | loss scale: 1.0 | grad norm: 0.807 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:28:15] iteration      750/    1000 | consumed samples:        48000 | elapsed time per iteration (ms): 86971.5 | throughput per GPU (TFLOP/s/GPU): 88.6 | learning rate: 8.601767E-07 | global batch size:    64 | lm loss: 6.688051E-01 | loss scale: 1.0 | grad norm: 0.802 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:29:58] iteration      751/    1000 | consumed samples:        48064 | elapsed time per iteration (ms): 102811.2 | throughput per GPU (TFLOP/s/GPU): 75.0 | learning rate: 8.544401E-07 | global batch size:    64 | lm loss: 6.897563E-01 | loss scale: 1.0 | grad norm: 0.980 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 08:31:19] iteration      752/    1000 | consumed samples:        48128 | elapsed time per iteration (ms): 80807.5 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 8.487213E-07 | global batch size:    64 | lm loss: 6.634867E-01 | loss scale: 1.0 | grad norm: 1.802 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 08:32:48] iteration      753/    1000 | consumed samples:        48192 | elapsed time per iteration (ms): 89405.0 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 8.430203E-07 | global batch size:    64 | lm loss: 7.193509E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 08:35:03] iteration      754/    1000 | consumed samples:        48256 | elapsed time per iteration (ms): 134288.4 | throughput per GPU (TFLOP/s/GPU): 57.4 | learning rate: 8.373373E-07 | global batch size:    64 | lm loss: 7.454703E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 08:36:27] iteration      755/    1000 | consumed samples:        48320 | elapsed time per iteration (ms): 84129.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 8.316722E-07 | global batch size:    64 | lm loss: 6.674823E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 08:38:34] iteration      756/    1000 | consumed samples:        48384 | elapsed time per iteration (ms): 127023.5 | throughput per GPU (TFLOP/s/GPU): 60.7 | learning rate: 8.260251E-07 | global batch size:    64 | lm loss: 6.470302E-01 | loss scale: 1.0 | grad norm: 1.098 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-28 08:40:07] iteration      757/    1000 | consumed samples:        48448 | elapsed time per iteration (ms): 92857.7 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 8.203961E-07 | global batch size:    64 | lm loss: 6.956238E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x555dede22280] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 08:41:32] iteration      758/    1000 | consumed samples:        48512 | elapsed time per iteration (ms): 85650.1 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 8.147852E-07 | global batch size:    64 | lm loss: 6.154037E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 08:42:45] iteration      759/    1000 | consumed samples:        48576 | elapsed time per iteration (ms): 72871.9 | throughput per GPU (TFLOP/s/GPU): 105.8 | learning rate: 8.091926E-07 | global batch size:    64 | lm loss: 6.568743E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations:   0 | number of nan iterations:   0 |
Token indices sequence length is longer than the specified maximum sequence length for this model (137911 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (137911 > 131072). Running this sequence through the model will result in indexing errors
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955cbb4c0] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 08:44:09] iteration      760/    1000 | consumed samples:        48640 | elapsed time per iteration (ms): 83541.6 | throughput per GPU (TFLOP/s/GPU): 92.3 | learning rate: 8.036182E-07 | global batch size:    64 | lm loss: 6.524685E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 08:45:32] iteration      761/    1000 | consumed samples:        48704 | elapsed time per iteration (ms): 83282.4 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 7.980621E-07 | global batch size:    64 | lm loss: 7.427295E-01 | loss scale: 1.0 | grad norm: 1.038 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 08:47:10] iteration      762/    1000 | consumed samples:        48768 | elapsed time per iteration (ms): 97565.2 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 7.925244E-07 | global batch size:    64 | lm loss: 6.557251E-01 | loss scale: 1.0 | grad norm: 1.083 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-28 08:49:03] iteration      763/    1000 | consumed samples:        48832 | elapsed time per iteration (ms): 112877.4 | throughput per GPU (TFLOP/s/GPU): 68.3 | learning rate: 7.870051E-07 | global batch size:    64 | lm loss: 7.390345E-01 | loss scale: 1.0 | grad norm: 0.930 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
 [2024-11-28 08:50:22] iteration      764/    1000 | consumed samples:        48896 | elapsed time per iteration (ms): 79888.6 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 7.815044E-07 | global batch size:    64 | lm loss: 5.950512E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 08:52:05] iteration      765/    1000 | consumed samples:        48960 | elapsed time per iteration (ms): 102480.9 | throughput per GPU (TFLOP/s/GPU): 75.2 | learning rate: 7.760222E-07 | global batch size:    64 | lm loss: 6.820820E-01 | loss scale: 1.0 | grad norm: 1.007 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 08:53:24] iteration      766/    1000 | consumed samples:        49024 | elapsed time per iteration (ms): 79224.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 7.705586E-07 | global batch size:    64 | lm loss: 7.155175E-01 | loss scale: 1.0 | grad norm: 1.681 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 08:54:44] iteration      767/    1000 | consumed samples:        49088 | elapsed time per iteration (ms): 80096.9 | throughput per GPU (TFLOP/s/GPU): 96.2 | learning rate: 7.651136E-07 | global batch size:    64 | lm loss: 7.093288E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-28 08:57:45] iteration      768/    1000 | consumed samples:        49152 | elapsed time per iteration (ms): 180236.9 | throughput per GPU (TFLOP/s/GPU): 42.8 | learning rate: 7.596874E-07 | global batch size:    64 | lm loss: 6.092068E-01 | loss scale: 1.0 | grad norm: 0.760 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 08:59:23] iteration      769/    1000 | consumed samples:        49216 | elapsed time per iteration (ms): 98856.7 | throughput per GPU (TFLOP/s/GPU): 78.0 | learning rate: 7.542799E-07 | global batch size:    64 | lm loss: 6.536492E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:01:05] iteration      770/    1000 | consumed samples:        49280 | elapsed time per iteration (ms): 101480.6 | throughput per GPU (TFLOP/s/GPU): 76.0 | learning rate: 7.488913E-07 | global batch size:    64 | lm loss: 6.581574E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-28 09:02:15] iteration      771/    1000 | consumed samples:        49344 | elapsed time per iteration (ms): 70333.8 | throughput per GPU (TFLOP/s/GPU): 109.6 | learning rate: 7.435216E-07 | global batch size:    64 | lm loss: 6.496270E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:03:58] iteration      772/    1000 | consumed samples:        49408 | elapsed time per iteration (ms): 102719.8 | throughput per GPU (TFLOP/s/GPU): 75.0 | learning rate: 7.381709E-07 | global batch size:    64 | lm loss: 6.505972E-01 | loss scale: 1.0 | grad norm: 0.976 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec9973c0] [h264 @ 0x55d9574d90c0] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 09:05:42] iteration      773/    1000 | consumed samples:        49472 | elapsed time per iteration (ms): 104214.6 | throughput per GPU (TFLOP/s/GPU): 74.0 | learning rate: 7.328391E-07 | global batch size:    64 | lm loss: 6.121303E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-28 09:07:11] iteration      774/    1000 | consumed samples:        49536 | elapsed time per iteration (ms): 88647.4 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 7.275264E-07 | global batch size:    64 | lm loss: 6.867324E-01 | loss scale: 1.0 | grad norm: 0.915 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
 [2024-11-28 09:08:24] iteration      775/    1000 | consumed samples:        49600 | elapsed time per iteration (ms): 73074.2 | throughput per GPU (TFLOP/s/GPU): 105.5 | learning rate: 7.222328E-07 | global batch size:    64 | lm loss: 6.488966E-01 | loss scale: 1.0 | grad norm: 0.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x55d957a1cec0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 09:10:06] iteration      776/    1000 | consumed samples:        49664 | elapsed time per iteration (ms): 102250.9 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 7.169584E-07 | global batch size:    64 | lm loss: 6.472382E-01 | loss scale: 1.0 | grad norm: 1.003 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 09:11:37] iteration      777/    1000 | consumed samples:        49728 | elapsed time per iteration (ms): 90431.0 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 7.117032E-07 | global batch size:    64 | lm loss: 7.486657E-01 | loss scale: 1.0 | grad norm: 1.617 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
 [2024-11-28 09:13:11] iteration      778/    1000 | consumed samples:        49792 | elapsed time per iteration (ms): 94071.2 | throughput per GPU (TFLOP/s/GPU): 81.9 | learning rate: 7.064673E-07 | global batch size:    64 | lm loss: 6.917417E-01 | loss scale: 1.0 | grad norm: 1.739 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:14:34] iteration      779/    1000 | consumed samples:        49856 | elapsed time per iteration (ms): 83680.9 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 7.012508E-07 | global batch size:    64 | lm loss: 6.594355E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 09:16:00] iteration      780/    1000 | consumed samples:        49920 | elapsed time per iteration (ms): 85537.9 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 6.960536E-07 | global batch size:    64 | lm loss: 6.973290E-01 | loss scale: 1.0 | grad norm: 1.559 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
 [2024-11-28 09:17:25] iteration      781/    1000 | consumed samples:        49984 | elapsed time per iteration (ms): 84911.3 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 6.908759E-07 | global batch size:    64 | lm loss: 6.615264E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d9574d90c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 09:19:07] iteration      782/    1000 | consumed samples:        50048 | elapsed time per iteration (ms): 101571.3 | throughput per GPU (TFLOP/s/GPU): 75.9 | learning rate: 6.857177E-07 | global batch size:    64 | lm loss: 6.533257E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 09:20:28] iteration      783/    1000 | consumed samples:        50112 | elapsed time per iteration (ms): 81688.9 | throughput per GPU (TFLOP/s/GPU): 94.4 | learning rate: 6.805790E-07 | global batch size:    64 | lm loss: 6.432073E-01 | loss scale: 1.0 | grad norm: 0.974 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 09:22:00] iteration      784/    1000 | consumed samples:        50176 | elapsed time per iteration (ms): 91262.4 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 6.754599E-07 | global batch size:    64 | lm loss: 7.190090E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 09:23:47] iteration      785/    1000 | consumed samples:        50240 | elapsed time per iteration (ms): 107395.2 | throughput per GPU (TFLOP/s/GPU): 71.8 | learning rate: 6.703605E-07 | global batch size:    64 | lm loss: 6.706704E-01 | loss scale: 1.0 | grad norm: 0.855 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:25:05] iteration      786/    1000 | consumed samples:        50304 | elapsed time per iteration (ms): 78147.3 | throughput per GPU (TFLOP/s/GPU): 98.6 | learning rate: 6.652809E-07 | global batch size:    64 | lm loss: 7.410614E-01 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-28 09:26:31] iteration      787/    1000 | consumed samples:        50368 | elapsed time per iteration (ms): 85945.0 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 6.602210E-07 | global batch size:    64 | lm loss: 6.612120E-01 | loss scale: 1.0 | grad norm: 1.331 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
 [2024-11-28 09:27:49] iteration      788/    1000 | consumed samples:        50432 | elapsed time per iteration (ms): 77740.8 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 6.551809E-07 | global batch size:    64 | lm loss: 7.221023E-01 | loss scale: 1.0 | grad norm: 0.928 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:29:00] iteration      789/    1000 | consumed samples:        50496 | elapsed time per iteration (ms): 71128.4 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 6.501607E-07 | global batch size:    64 | lm loss: 7.365326E-01 | loss scale: 1.0 | grad norm: 1.031 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
 [2024-11-28 09:30:29] iteration      790/    1000 | consumed samples:        50560 | elapsed time per iteration (ms): 89353.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 6.451604E-07 | global batch size:    64 | lm loss: 6.473880E-01 | loss scale: 1.0 | grad norm: 0.865 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
 [2024-11-28 09:31:45] iteration      791/    1000 | consumed samples:        50624 | elapsed time per iteration (ms): 75220.3 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 6.401801E-07 | global batch size:    64 | lm loss: 6.595445E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95a292180] mmco: unref short failure
 [2024-11-28 09:33:20] iteration      792/    1000 | consumed samples:        50688 | elapsed time per iteration (ms): 95120.8 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 6.352198E-07 | global batch size:    64 | lm loss: 6.879074E-01 | loss scale: 1.0 | grad norm: 0.819 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
 [2024-11-28 09:34:50] iteration      793/    1000 | consumed samples:        50752 | elapsed time per iteration (ms): 90072.9 | throughput per GPU (TFLOP/s/GPU): 85.6 | learning rate: 6.302797E-07 | global batch size:    64 | lm loss: 5.921465E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 09:36:20] iteration      794/    1000 | consumed samples:        50816 | elapsed time per iteration (ms): 89985.7 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 6.253596E-07 | global batch size:    64 | lm loss: 7.055615E-01 | loss scale: 1.0 | grad norm: 0.736 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
 [2024-11-28 09:38:08] iteration      795/    1000 | consumed samples:        50880 | elapsed time per iteration (ms): 107875.5 | throughput per GPU (TFLOP/s/GPU): 71.5 | learning rate: 6.204598E-07 | global batch size:    64 | lm loss: 6.044904E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
 [2024-11-28 09:39:48] iteration      796/    1000 | consumed samples:        50944 | elapsed time per iteration (ms): 100652.8 | throughput per GPU (TFLOP/s/GPU): 76.6 | learning rate: 6.155801E-07 | global batch size:    64 | lm loss: 6.538367E-01 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
[h264 @ 0x55d956f98f80] mmco: unref short failure
 [2024-11-28 09:41:02] iteration      797/    1000 | consumed samples:        51008 | elapsed time per iteration (ms): 73916.1 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 6.107208E-07 | global batch size:    64 | lm loss: 6.171812E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 09:42:24] iteration      798/    1000 | consumed samples:        51072 | elapsed time per iteration (ms): 81723.2 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 6.058818E-07 | global batch size:    64 | lm loss: 6.591862E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
 [2024-11-28 09:43:44] iteration      799/    1000 | consumed samples:        51136 | elapsed time per iteration (ms): 79627.9 | throughput per GPU (TFLOP/s/GPU): 96.8 | learning rate: 6.010633E-07 | global batch size:    64 | lm loss: 6.704946E-01 | loss scale: 1.0 | grad norm: 1.459 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 09:44:55] iteration      800/    1000 | consumed samples:        51200 | elapsed time per iteration (ms): 70975.0 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 5.962651E-07 | global batch size:    64 | lm loss: 7.106262E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (154771.64, 154771.93)
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 09:48:58] iteration      801/    1000 | consumed samples:        51264 | elapsed time per iteration (ms): 88671.1 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 5.914875E-07 | global batch size:    64 | lm loss: 7.269337E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
 [2024-11-28 09:50:27] iteration      802/    1000 | consumed samples:        51328 | elapsed time per iteration (ms): 88770.2 | throughput per GPU (TFLOP/s/GPU): 86.8 | learning rate: 5.867304E-07 | global batch size:    64 | lm loss: 6.636480E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
 [2024-11-28 09:52:17] iteration      803/    1000 | consumed samples:        51392 | elapsed time per iteration (ms): 109923.6 | throughput per GPU (TFLOP/s/GPU): 70.1 | learning rate: 5.819938E-07 | global batch size:    64 | lm loss: 7.249713E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
 [2024-11-28 09:53:48] iteration      804/    1000 | consumed samples:        51456 | elapsed time per iteration (ms): 91267.2 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 5.772780E-07 | global batch size:    64 | lm loss: 6.352175E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
 [2024-11-28 09:55:04] iteration      805/    1000 | consumed samples:        51520 | elapsed time per iteration (ms): 76083.1 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 5.725828E-07 | global batch size:    64 | lm loss: 6.475290E-01 | loss scale: 1.0 | grad norm: 0.971 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-28 09:56:38] iteration      806/    1000 | consumed samples:        51584 | elapsed time per iteration (ms): 94181.1 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 5.679084E-07 | global batch size:    64 | lm loss: 7.037023E-01 | loss scale: 1.0 | grad norm: 1.058 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x555dece38b80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 09:57:48] iteration      807/    1000 | consumed samples:        51648 | elapsed time per iteration (ms): 69970.7 | throughput per GPU (TFLOP/s/GPU): 110.2 | learning rate: 5.632547E-07 | global batch size:    64 | lm loss: 6.819202E-01 | loss scale: 1.0 | grad norm: 1.416 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 09:59:06] iteration      808/    1000 | consumed samples:        51712 | elapsed time per iteration (ms): 77503.9 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 5.586219E-07 | global batch size:    64 | lm loss: 6.660787E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-28 10:01:01] iteration      809/    1000 | consumed samples:        51776 | elapsed time per iteration (ms): 115416.3 | throughput per GPU (TFLOP/s/GPU): 66.8 | learning rate: 5.540100E-07 | global batch size:    64 | lm loss: 6.367390E-01 | loss scale: 1.0 | grad norm: 0.986 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-28 10:02:37] iteration      810/    1000 | consumed samples:        51840 | elapsed time per iteration (ms): 95696.8 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 5.494190E-07 | global batch size:    64 | lm loss: 7.794733E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 10:04:12] iteration      811/    1000 | consumed samples:        51904 | elapsed time per iteration (ms): 94858.7 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 5.448490E-07 | global batch size:    64 | lm loss: 6.537660E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 10:05:33] iteration      812/    1000 | consumed samples:        51968 | elapsed time per iteration (ms): 81180.0 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 5.403001E-07 | global batch size:    64 | lm loss: 6.626501E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-28 10:06:58] iteration      813/    1000 | consumed samples:        52032 | elapsed time per iteration (ms): 84762.2 | throughput per GPU (TFLOP/s/GPU): 90.9 | learning rate: 5.357722E-07 | global batch size:    64 | lm loss: 6.666785E-01 | loss scale: 1.0 | grad norm: 3.573 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 10:08:18] iteration      814/    1000 | consumed samples:        52096 | elapsed time per iteration (ms): 79762.7 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 5.312654E-07 | global batch size:    64 | lm loss: 6.747634E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 10:09:25] iteration      815/    1000 | consumed samples:        52160 | elapsed time per iteration (ms): 67840.9 | throughput per GPU (TFLOP/s/GPU): 113.6 | learning rate: 5.267799E-07 | global batch size:    64 | lm loss: 6.878562E-01 | loss scale: 1.0 | grad norm: 1.202 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 10:10:59] iteration      816/    1000 | consumed samples:        52224 | elapsed time per iteration (ms): 93363.9 | throughput per GPU (TFLOP/s/GPU): 82.6 | learning rate: 5.223155E-07 | global batch size:    64 | lm loss: 6.529773E-01 | loss scale: 1.0 | grad norm: 1.001 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dee792700] mmco: unref short failure
 [2024-11-28 10:12:22] iteration      817/    1000 | consumed samples:        52288 | elapsed time per iteration (ms): 83678.3 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 5.178724E-07 | global batch size:    64 | lm loss: 7.590072E-01 | loss scale: 1.0 | grad norm: 0.930 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
 [2024-11-28 10:13:31] iteration      818/    1000 | consumed samples:        52352 | elapsed time per iteration (ms): 68109.4 | throughput per GPU (TFLOP/s/GPU): 113.2 | learning rate: 5.134507E-07 | global batch size:    64 | lm loss: 7.248871E-01 | loss scale: 1.0 | grad norm: 0.877 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-28 10:15:02] iteration      819/    1000 | consumed samples:        52416 | elapsed time per iteration (ms): 91126.7 | throughput per GPU (TFLOP/s/GPU): 84.6 | learning rate: 5.090503E-07 | global batch size:    64 | lm loss: 6.618973E-01 | loss scale: 1.0 | grad norm: 0.951 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 10:16:25] iteration      820/    1000 | consumed samples:        52480 | elapsed time per iteration (ms): 83615.0 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 5.046713E-07 | global batch size:    64 | lm loss: 6.375130E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 10:18:24] iteration      821/    1000 | consumed samples:        52544 | elapsed time per iteration (ms): 118406.0 | throughput per GPU (TFLOP/s/GPU): 65.1 | learning rate: 5.003137E-07 | global batch size:    64 | lm loss: 6.340280E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 10:19:49] iteration      822/    1000 | consumed samples:        52608 | elapsed time per iteration (ms): 85555.0 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 4.959777E-07 | global batch size:    64 | lm loss: 6.423296E-01 | loss scale: 1.0 | grad norm: 0.895 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 10:21:05] iteration      823/    1000 | consumed samples:        52672 | elapsed time per iteration (ms): 75694.4 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 4.916632E-07 | global batch size:    64 | lm loss: 6.706414E-01 | loss scale: 1.0 | grad norm: 0.728 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 10:22:34] iteration      824/    1000 | consumed samples:        52736 | elapsed time per iteration (ms): 88878.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 4.873703E-07 | global batch size:    64 | lm loss: 6.234448E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 10:23:49] iteration      825/    1000 | consumed samples:        52800 | elapsed time per iteration (ms): 75189.4 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.830990E-07 | global batch size:    64 | lm loss: 6.897532E-01 | loss scale: 1.0 | grad norm: 0.922 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
 [2024-11-28 10:25:16] iteration      826/    1000 | consumed samples:        52864 | elapsed time per iteration (ms): 87247.9 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 4.788494E-07 | global batch size:    64 | lm loss: 7.539995E-01 | loss scale: 1.0 | grad norm: 0.943 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 10:26:25] iteration      827/    1000 | consumed samples:        52928 | elapsed time per iteration (ms): 68636.7 | throughput per GPU (TFLOP/s/GPU): 112.3 | learning rate: 4.746216E-07 | global batch size:    64 | lm loss: 5.952159E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
 [2024-11-28 10:28:45] iteration      828/    1000 | consumed samples:        52992 | elapsed time per iteration (ms): 140003.7 | throughput per GPU (TFLOP/s/GPU): 55.1 | learning rate: 4.704155E-07 | global batch size:    64 | lm loss: 6.752132E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x555defa21280] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
[h264 @ 0x55d956fbbd80] mmco: unref short failure
 [2024-11-28 10:30:20] iteration      829/    1000 | consumed samples:        53056 | elapsed time per iteration (ms): 95171.1 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 4.662312E-07 | global batch size:    64 | lm loss: 6.471357E-01 | loss scale: 1.0 | grad norm: 0.823 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956f537c0] mmco: unref short failure
 [2024-11-28 10:31:46] iteration      830/    1000 | consumed samples:        53120 | elapsed time per iteration (ms): 85230.2 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.620688E-07 | global batch size:    64 | lm loss: 5.844744E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 3000 unjoint_samples 3000 joint_samples 205 [127422, 25206]
processed_samples 3000 unjoint_samples 3000 joint_samples 205 [127422, 25206]
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
processed_samples 3000 unjoint_samples 3000 joint_samples 210 [115831, 29036]
processed_samples 3000 unjoint_samples 3000 joint_samples 210 [115831, 29036]
processed_samples 3000 unjoint_samples 3000 joint_samples 204 [124064, 55170]
processed_samples 3000 unjoint_samples 3000 joint_samples 204 [124064, 55170]
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
processed_samples 3000 unjoint_samples 3000 joint_samples 210 [109963, 100082]
processed_samples 3000 unjoint_samples 3000 joint_samples 210 [109963, 100082]
processed_samples 3000 unjoint_samples 3000 joint_samples 215 [95111, 126828]
processed_samples 3000 unjoint_samples 3000 joint_samples 209 [116530, 129996]
processed_samples 3000 unjoint_samples 3000 joint_samples 215 [95111, 126828]
processed_samples 3000 unjoint_samples 3000 joint_samples 209 [116530, 129996]
processed_samples 3000 unjoint_samples 3000 joint_samples 207 [120032, 17059]
processed_samples 3000 unjoint_samples 3000 joint_samples 207 [120032, 17059]
 [2024-11-28 10:32:55] iteration      831/    1000 | consumed samples:        53184 | elapsed time per iteration (ms): 69294.3 | throughput per GPU (TFLOP/s/GPU): 111.2 | learning rate: 4.579283E-07 | global batch size:    64 | lm loss: 7.035551E-01 | loss scale: 1.0 | grad norm: 1.105 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
processed_samples 3001 unjoint_samples 3000 joint_samples 204 [96327, 114487]
processed_samples 3001 unjoint_samples 3000 joint_samples 204 [96327, 114487]
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x555dec9d4800] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 10:34:11] iteration      832/    1000 | consumed samples:        53248 | elapsed time per iteration (ms): 76534.5 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.538097E-07 | global batch size:    64 | lm loss: 7.037050E-01 | loss scale: 1.0 | grad norm: 0.927 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 10:35:35] iteration      833/    1000 | consumed samples:        53312 | elapsed time per iteration (ms): 83738.5 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.497131E-07 | global batch size:    64 | lm loss: 6.761808E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 10:36:49] iteration      834/    1000 | consumed samples:        53376 | elapsed time per iteration (ms): 73376.7 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 4.456385E-07 | global batch size:    64 | lm loss: 7.318485E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 10:38:26] iteration      835/    1000 | consumed samples:        53440 | elapsed time per iteration (ms): 97012.1 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 4.415861E-07 | global batch size:    64 | lm loss: 6.641833E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-28 10:39:44] iteration      836/    1000 | consumed samples:        53504 | elapsed time per iteration (ms): 78840.2 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 4.375557E-07 | global batch size:    64 | lm loss: 6.187526E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 10:41:09] iteration      837/    1000 | consumed samples:        53568 | elapsed time per iteration (ms): 84164.7 | throughput per GPU (TFLOP/s/GPU): 91.6 | learning rate: 4.335475E-07 | global batch size:    64 | lm loss: 7.115659E-01 | loss scale: 1.0 | grad norm: 0.869 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
 [2024-11-28 10:42:43] iteration      838/    1000 | consumed samples:        53632 | elapsed time per iteration (ms): 94853.6 | throughput per GPU (TFLOP/s/GPU): 81.3 | learning rate: 4.295615E-07 | global batch size:    64 | lm loss: 7.126722E-01 | loss scale: 1.0 | grad norm: 1.005 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 10:44:07] iteration      839/    1000 | consumed samples:        53696 | elapsed time per iteration (ms): 83725.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.255977E-07 | global batch size:    64 | lm loss: 6.734149E-01 | loss scale: 1.0 | grad norm: 0.770 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-28 10:45:38] iteration      840/    1000 | consumed samples:        53760 | elapsed time per iteration (ms): 91050.7 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 4.216562E-07 | global batch size:    64 | lm loss: 6.599411E-01 | loss scale: 1.0 | grad norm: 1.389 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 10:47:12] iteration      841/    1000 | consumed samples:        53824 | elapsed time per iteration (ms): 93607.0 | throughput per GPU (TFLOP/s/GPU): 82.3 | learning rate: 4.177371E-07 | global batch size:    64 | lm loss: 6.965013E-01 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 10:48:31] iteration      842/    1000 | consumed samples:        53888 | elapsed time per iteration (ms): 79458.7 | throughput per GPU (TFLOP/s/GPU): 97.0 | learning rate: 4.138403E-07 | global batch size:    64 | lm loss: 6.894560E-01 | loss scale: 1.0 | grad norm: 1.232 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
[h264 @ 0x55d955ffc880] mmco: unref short failure
 [2024-11-28 10:49:57] iteration      843/    1000 | consumed samples:        53952 | elapsed time per iteration (ms): 85751.9 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 4.099659E-07 | global batch size:    64 | lm loss: 6.982825E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 10:51:28] iteration      844/    1000 | consumed samples:        54016 | elapsed time per iteration (ms): 90570.1 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.061140E-07 | global batch size:    64 | lm loss: 6.635154E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 10:53:04] iteration      845/    1000 | consumed samples:        54080 | elapsed time per iteration (ms): 96328.2 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 4.022845E-07 | global batch size:    64 | lm loss: 7.333788E-01 | loss scale: 1.0 | grad norm: 0.926 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 10:54:30] iteration      846/    1000 | consumed samples:        54144 | elapsed time per iteration (ms): 85899.4 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 3.984776E-07 | global batch size:    64 | lm loss: 7.403162E-01 | loss scale: 1.0 | grad norm: 1.022 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 10:55:51] iteration      847/    1000 | consumed samples:        54208 | elapsed time per iteration (ms): 81509.2 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 3.946933E-07 | global batch size:    64 | lm loss: 6.359328E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
 [2024-11-28 10:57:18] iteration      848/    1000 | consumed samples:        54272 | elapsed time per iteration (ms): 86457.1 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.909315E-07 | global batch size:    64 | lm loss: 6.534768E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 10:59:08] iteration      849/    1000 | consumed samples:        54336 | elapsed time per iteration (ms): 109584.1 | throughput per GPU (TFLOP/s/GPU): 70.3 | learning rate: 3.871925E-07 | global batch size:    64 | lm loss: 6.476867E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 11:00:45] iteration      850/    1000 | consumed samples:        54400 | elapsed time per iteration (ms): 97565.7 | throughput per GPU (TFLOP/s/GPU): 79.0 | learning rate: 3.834760E-07 | global batch size:    64 | lm loss: 6.968692E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:02:06] iteration      851/    1000 | consumed samples:        54464 | elapsed time per iteration (ms): 81200.0 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 3.797824E-07 | global batch size:    64 | lm loss: 6.481150E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 11:03:33] iteration      852/    1000 | consumed samples:        54528 | elapsed time per iteration (ms): 86955.3 | throughput per GPU (TFLOP/s/GPU): 88.6 | learning rate: 3.761115E-07 | global batch size:    64 | lm loss: 6.985618E-01 | loss scale: 1.0 | grad norm: 1.381 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-28 11:05:07] iteration      853/    1000 | consumed samples:        54592 | elapsed time per iteration (ms): 93991.4 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 3.724633E-07 | global batch size:    64 | lm loss: 6.918560E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 11:07:09] iteration      854/    1000 | consumed samples:        54656 | elapsed time per iteration (ms): 122071.7 | throughput per GPU (TFLOP/s/GPU): 63.1 | learning rate: 3.688381E-07 | global batch size:    64 | lm loss: 6.708304E-01 | loss scale: 1.0 | grad norm: 0.839 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
 [2024-11-28 11:08:48] iteration      855/    1000 | consumed samples:        54720 | elapsed time per iteration (ms): 99086.2 | throughput per GPU (TFLOP/s/GPU): 77.8 | learning rate: 3.652357E-07 | global batch size:    64 | lm loss: 5.969853E-01 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 11:10:39] iteration      856/    1000 | consumed samples:        54784 | elapsed time per iteration (ms): 110343.7 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 3.616562E-07 | global batch size:    64 | lm loss: 6.009317E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:12:05] iteration      857/    1000 | consumed samples:        54848 | elapsed time per iteration (ms): 86422.8 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.580997E-07 | global batch size:    64 | lm loss: 7.241082E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-28 11:13:29] iteration      858/    1000 | consumed samples:        54912 | elapsed time per iteration (ms): 83739.2 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 3.545662E-07 | global batch size:    64 | lm loss: 6.354287E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-28 11:15:01] iteration      859/    1000 | consumed samples:        54976 | elapsed time per iteration (ms): 92087.2 | throughput per GPU (TFLOP/s/GPU): 83.7 | learning rate: 3.510557E-07 | global batch size:    64 | lm loss: 6.979592E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555dee6b8180] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
 [2024-11-28 11:16:17] iteration      860/    1000 | consumed samples:        55040 | elapsed time per iteration (ms): 75816.1 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 3.475682E-07 | global batch size:    64 | lm loss: 6.640246E-01 | loss scale: 1.0 | grad norm: 1.084 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:17:45] iteration      861/    1000 | consumed samples:        55104 | elapsed time per iteration (ms): 87869.5 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 3.441039E-07 | global batch size:    64 | lm loss: 6.442217E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 11:19:15] iteration      862/    1000 | consumed samples:        55168 | elapsed time per iteration (ms): 90642.3 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 3.406627E-07 | global batch size:    64 | lm loss: 5.933395E-01 | loss scale: 1.0 | grad norm: 0.937 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:20:32] iteration      863/    1000 | consumed samples:        55232 | elapsed time per iteration (ms): 76873.5 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 3.372447E-07 | global batch size:    64 | lm loss: 6.333019E-01 | loss scale: 1.0 | grad norm: 0.810 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 11:22:00] iteration      864/    1000 | consumed samples:        55296 | elapsed time per iteration (ms): 87690.3 | throughput per GPU (TFLOP/s/GPU): 87.9 | learning rate: 3.338499E-07 | global batch size:    64 | lm loss: 6.640263E-01 | loss scale: 1.0 | grad norm: 1.172 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x555ded0b5480] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 11:23:55] iteration      865/    1000 | consumed samples:        55360 | elapsed time per iteration (ms): 114569.4 | throughput per GPU (TFLOP/s/GPU): 67.3 | learning rate: 3.304783E-07 | global batch size:    64 | lm loss: 6.280389E-01 | loss scale: 1.0 | grad norm: 0.773 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:25:10] iteration      866/    1000 | consumed samples:        55424 | elapsed time per iteration (ms): 75815.6 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 3.271301E-07 | global batch size:    64 | lm loss: 7.477145E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-28 11:26:37] iteration      867/    1000 | consumed samples:        55488 | elapsed time per iteration (ms): 86926.4 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 3.238051E-07 | global batch size:    64 | lm loss: 6.348403E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
 [2024-11-28 11:28:47] iteration      868/    1000 | consumed samples:        55552 | elapsed time per iteration (ms): 129771.7 | throughput per GPU (TFLOP/s/GPU): 59.4 | learning rate: 3.205035E-07 | global batch size:    64 | lm loss: 6.754884E-01 | loss scale: 1.0 | grad norm: 0.841 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
 [2024-11-28 11:30:11] iteration      869/    1000 | consumed samples:        55616 | elapsed time per iteration (ms): 83466.8 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.172253E-07 | global batch size:    64 | lm loss: 6.710061E-01 | loss scale: 1.0 | grad norm: 0.948 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:31:31] iteration      870/    1000 | consumed samples:        55680 | elapsed time per iteration (ms): 79923.1 | throughput per GPU (TFLOP/s/GPU): 96.4 | learning rate: 3.139705E-07 | global batch size:    64 | lm loss: 6.762735E-01 | loss scale: 1.0 | grad norm: 2.749 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:32:46] iteration      871/    1000 | consumed samples:        55744 | elapsed time per iteration (ms): 75679.4 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 3.107391E-07 | global batch size:    64 | lm loss: 6.117524E-01 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:34:09] iteration      872/    1000 | consumed samples:        55808 | elapsed time per iteration (ms): 82728.9 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 3.075313E-07 | global batch size:    64 | lm loss: 6.489487E-01 | loss scale: 1.0 | grad norm: 0.882 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
 [2024-11-28 11:35:54] iteration      873/    1000 | consumed samples:        55872 | elapsed time per iteration (ms): 104669.5 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 3.043469E-07 | global batch size:    64 | lm loss: 6.838205E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x555dedc5cf80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:37:20] iteration      874/    1000 | consumed samples:        55936 | elapsed time per iteration (ms): 86668.0 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 3.011862E-07 | global batch size:    64 | lm loss: 6.566857E-01 | loss scale: 1.0 | grad norm: 0.933 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:38:50] iteration      875/    1000 | consumed samples:        56000 | elapsed time per iteration (ms): 89306.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 2.980490E-07 | global batch size:    64 | lm loss: 6.576104E-01 | loss scale: 1.0 | grad norm: 0.993 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
 [2024-11-28 11:40:21] iteration      876/    1000 | consumed samples:        56064 | elapsed time per iteration (ms): 91021.0 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 2.949354E-07 | global batch size:    64 | lm loss: 6.744531E-01 | loss scale: 1.0 | grad norm: 0.759 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
 [2024-11-28 11:42:18] iteration      877/    1000 | consumed samples:        56128 | elapsed time per iteration (ms): 117762.8 | throughput per GPU (TFLOP/s/GPU): 65.5 | learning rate: 2.918455E-07 | global batch size:    64 | lm loss: 6.744663E-01 | loss scale: 1.0 | grad norm: 1.002 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
 [2024-11-28 11:43:58] iteration      878/    1000 | consumed samples:        56192 | elapsed time per iteration (ms): 99548.2 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 2.887793E-07 | global batch size:    64 | lm loss: 6.407770E-01 | loss scale: 1.0 | grad norm: 0.719 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 11:45:28] iteration      879/    1000 | consumed samples:        56256 | elapsed time per iteration (ms): 89613.4 | throughput per GPU (TFLOP/s/GPU): 86.0 | learning rate: 2.857367E-07 | global batch size:    64 | lm loss: 7.363208E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x555df3e6f800] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
 [2024-11-28 11:47:22] iteration      880/    1000 | consumed samples:        56320 | elapsed time per iteration (ms): 114021.1 | throughput per GPU (TFLOP/s/GPU): 67.6 | learning rate: 2.827180E-07 | global batch size:    64 | lm loss: 5.785704E-01 | loss scale: 1.0 | grad norm: 0.735 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 11:48:44] iteration      881/    1000 | consumed samples:        56384 | elapsed time per iteration (ms): 81971.8 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 2.797230E-07 | global batch size:    64 | lm loss: 7.418721E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 11:50:13] iteration      882/    1000 | consumed samples:        56448 | elapsed time per iteration (ms): 89729.7 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 2.767519E-07 | global batch size:    64 | lm loss: 7.065222E-01 | loss scale: 1.0 | grad norm: 0.932 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
 [2024-11-28 11:51:29] iteration      883/    1000 | consumed samples:        56512 | elapsed time per iteration (ms): 75093.3 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 2.738045E-07 | global batch size:    64 | lm loss: 6.494349E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 11:53:07] iteration      884/    1000 | consumed samples:        56576 | elapsed time per iteration (ms): 98685.4 | throughput per GPU (TFLOP/s/GPU): 78.1 | learning rate: 2.708811E-07 | global batch size:    64 | lm loss: 6.827214E-01 | loss scale: 1.0 | grad norm: 0.803 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 11:54:26] iteration      885/    1000 | consumed samples:        56640 | elapsed time per iteration (ms): 78916.8 | throughput per GPU (TFLOP/s/GPU): 97.7 | learning rate: 2.679816E-07 | global batch size:    64 | lm loss: 6.591750E-01 | loss scale: 1.0 | grad norm: 2.252 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555def251b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 11:56:03] iteration      886/    1000 | consumed samples:        56704 | elapsed time per iteration (ms): 96961.1 | throughput per GPU (TFLOP/s/GPU): 79.5 | learning rate: 2.651060E-07 | global batch size:    64 | lm loss: 5.800773E-01 | loss scale: 1.0 | grad norm: 0.867 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee1d4480] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 11:57:57] iteration      887/    1000 | consumed samples:        56768 | elapsed time per iteration (ms): 113772.5 | throughput per GPU (TFLOP/s/GPU): 67.8 | learning rate: 2.622543E-07 | global batch size:    64 | lm loss: 6.355096E-01 | loss scale: 1.0 | grad norm: 1.576 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 11:59:32] iteration      888/    1000 | consumed samples:        56832 | elapsed time per iteration (ms): 95001.9 | throughput per GPU (TFLOP/s/GPU): 81.1 | learning rate: 2.594267E-07 | global batch size:    64 | lm loss: 6.524200E-01 | loss scale: 1.0 | grad norm: 0.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 12:01:10] iteration      889/    1000 | consumed samples:        56896 | elapsed time per iteration (ms): 98289.0 | throughput per GPU (TFLOP/s/GPU): 78.4 | learning rate: 2.566231E-07 | global batch size:    64 | lm loss: 6.437273E-01 | loss scale: 1.0 | grad norm: 1.347 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:02:49] iteration      890/    1000 | consumed samples:        56960 | elapsed time per iteration (ms): 99010.1 | throughput per GPU (TFLOP/s/GPU): 77.9 | learning rate: 2.538436E-07 | global batch size:    64 | lm loss: 6.941308E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:05:10] iteration      891/    1000 | consumed samples:        57024 | elapsed time per iteration (ms): 140404.7 | throughput per GPU (TFLOP/s/GPU): 54.9 | learning rate: 2.510881E-07 | global batch size:    64 | lm loss: 6.219099E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
 [2024-11-28 12:06:48] iteration      892/    1000 | consumed samples:        57088 | elapsed time per iteration (ms): 98149.2 | throughput per GPU (TFLOP/s/GPU): 78.5 | learning rate: 2.483568E-07 | global batch size:    64 | lm loss: 6.992269E-01 | loss scale: 1.0 | grad norm: 1.027 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955d53340] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
 [2024-11-28 12:08:06] iteration      893/    1000 | consumed samples:        57152 | elapsed time per iteration (ms): 77768.3 | throughput per GPU (TFLOP/s/GPU): 99.1 | learning rate: 2.456496E-07 | global batch size:    64 | lm loss: 6.428531E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 12:09:34] iteration      894/    1000 | consumed samples:        57216 | elapsed time per iteration (ms): 88494.0 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.429665E-07 | global batch size:    64 | lm loss: 6.883608E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:11:09] iteration      895/    1000 | consumed samples:        57280 | elapsed time per iteration (ms): 95126.9 | throughput per GPU (TFLOP/s/GPU): 81.0 | learning rate: 2.403077E-07 | global batch size:    64 | lm loss: 6.512347E-01 | loss scale: 1.0 | grad norm: 0.857 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
 [2024-11-28 12:12:38] iteration      896/    1000 | consumed samples:        57344 | elapsed time per iteration (ms): 88535.9 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.376731E-07 | global batch size:    64 | lm loss: 5.932364E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 12:13:56] iteration      897/    1000 | consumed samples:        57408 | elapsed time per iteration (ms): 78513.5 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 2.350628E-07 | global batch size:    64 | lm loss: 6.852360E-01 | loss scale: 1.0 | grad norm: 1.502 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 12:15:22] iteration      898/    1000 | consumed samples:        57472 | elapsed time per iteration (ms): 85905.6 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.324767E-07 | global batch size:    64 | lm loss: 6.652462E-01 | loss scale: 1.0 | grad norm: 1.168 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959cd93c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 12:17:05] iteration      899/    1000 | consumed samples:        57536 | elapsed time per iteration (ms): 102585.9 | throughput per GPU (TFLOP/s/GPU): 75.1 | learning rate: 2.299149E-07 | global batch size:    64 | lm loss: 6.818770E-01 | loss scale: 1.0 | grad norm: 0.756 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 12:18:22] iteration      900/    1000 | consumed samples:        57600 | elapsed time per iteration (ms): 77580.8 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 2.273775E-07 | global batch size:    64 | lm loss: 6.114725E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (267077.58, 267077.97)
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
 [2024-11-28 12:24:05] iteration      901/    1000 | consumed samples:        57664 | elapsed time per iteration (ms): 75885.6 | throughput per GPU (TFLOP/s/GPU): 101.6 | learning rate: 2.248645E-07 | global batch size:    64 | lm loss: 6.980703E-01 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 12:25:27] iteration      902/    1000 | consumed samples:        57728 | elapsed time per iteration (ms): 81488.6 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 2.223758E-07 | global batch size:    64 | lm loss: 7.051459E-01 | loss scale: 1.0 | grad norm: 0.884 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 12:26:51] iteration      903/    1000 | consumed samples:        57792 | elapsed time per iteration (ms): 84522.0 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 2.199115E-07 | global batch size:    64 | lm loss: 6.813552E-01 | loss scale: 1.0 | grad norm: 1.164 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 12:28:06] iteration      904/    1000 | consumed samples:        57856 | elapsed time per iteration (ms): 74846.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 2.174717E-07 | global batch size:    64 | lm loss: 6.666945E-01 | loss scale: 1.0 | grad norm: 0.965 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 12:29:16] iteration      905/    1000 | consumed samples:        57920 | elapsed time per iteration (ms): 69661.1 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 2.150564E-07 | global batch size:    64 | lm loss: 6.282266E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 12:30:29] iteration      906/    1000 | consumed samples:        57984 | elapsed time per iteration (ms): 72981.1 | throughput per GPU (TFLOP/s/GPU): 105.6 | learning rate: 2.126655E-07 | global batch size:    64 | lm loss: 6.620821E-01 | loss scale: 1.0 | grad norm: 0.951 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x555dedacf800] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
 [2024-11-28 12:31:51] iteration      907/    1000 | consumed samples:        58048 | elapsed time per iteration (ms): 82389.5 | throughput per GPU (TFLOP/s/GPU): 93.6 | learning rate: 2.102992E-07 | global batch size:    64 | lm loss: 6.396784E-01 | loss scale: 1.0 | grad norm: 1.041 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:35:14] iteration      908/    1000 | consumed samples:        58112 | elapsed time per iteration (ms): 202675.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.079574E-07 | global batch size:    64 | lm loss: 6.770626E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
 [2024-11-28 12:36:40] iteration      909/    1000 | consumed samples:        58176 | elapsed time per iteration (ms): 86037.5 | throughput per GPU (TFLOP/s/GPU): 89.6 | learning rate: 2.056402E-07 | global batch size:    64 | lm loss: 6.487489E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 12:38:12] iteration      910/    1000 | consumed samples:        58240 | elapsed time per iteration (ms): 91373.7 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 2.033476E-07 | global batch size:    64 | lm loss: 7.297677E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
 [2024-11-28 12:39:31] iteration      911/    1000 | consumed samples:        58304 | elapsed time per iteration (ms): 79254.7 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 2.010795E-07 | global batch size:    64 | lm loss: 7.363273E-01 | loss scale: 1.0 | grad norm: 0.994 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
 [2024-11-28 12:41:12] iteration      912/    1000 | consumed samples:        58368 | elapsed time per iteration (ms): 100778.2 | throughput per GPU (TFLOP/s/GPU): 76.5 | learning rate: 1.988362E-07 | global batch size:    64 | lm loss: 6.452677E-01 | loss scale: 1.0 | grad norm: 1.218 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x55d959911280] mmco: unref short failure
[h264 @ 0x55d956f78600] mmco: unref short failure
[h264 @ 0x555dee0da840] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
 [2024-11-28 12:42:40] iteration      913/    1000 | consumed samples:        58432 | elapsed time per iteration (ms): 88495.8 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 1.966174E-07 | global batch size:    64 | lm loss: 7.108324E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d955ee69c0] mmco: unref short failure
[h264 @ 0x55d955ee69c0] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x555decea5e80] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d95d112a80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x55d956b9c3c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
 [2024-11-28 12:43:57] iteration      914/    1000 | consumed samples:        58496 | elapsed time per iteration (ms): 76541.2 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.944234E-07 | global batch size:    64 | lm loss: 7.447613E-01 | loss scale: 1.0 | grad norm: 0.866 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:45:20] iteration      915/    1000 | consumed samples:        58560 | elapsed time per iteration (ms): 83460.2 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.922541E-07 | global batch size:    64 | lm loss: 6.993719E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
 [2024-11-28 12:46:44] iteration      916/    1000 | consumed samples:        58624 | elapsed time per iteration (ms): 83996.5 | throughput per GPU (TFLOP/s/GPU): 91.8 | learning rate: 1.901095E-07 | global batch size:    64 | lm loss: 6.727135E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555dec64f880] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 12:48:29] iteration      917/    1000 | consumed samples:        58688 | elapsed time per iteration (ms): 104791.6 | throughput per GPU (TFLOP/s/GPU): 73.6 | learning rate: 1.879897E-07 | global batch size:    64 | lm loss: 6.296797E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
 [2024-11-28 12:49:50] iteration      918/    1000 | consumed samples:        58752 | elapsed time per iteration (ms): 80806.4 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.858946E-07 | global batch size:    64 | lm loss: 6.743851E-01 | loss scale: 1.0 | grad norm: 0.817 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 12:51:03] iteration      919/    1000 | consumed samples:        58816 | elapsed time per iteration (ms): 73359.1 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 1.838244E-07 | global batch size:    64 | lm loss: 6.623120E-01 | loss scale: 1.0 | grad norm: 0.782 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:52:39] iteration      920/    1000 | consumed samples:        58880 | elapsed time per iteration (ms): 96155.9 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 1.817789E-07 | global batch size:    64 | lm loss: 6.973625E-01 | loss scale: 1.0 | grad norm: 1.291 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 12:54:10] iteration      921/    1000 | consumed samples:        58944 | elapsed time per iteration (ms): 90212.1 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 1.797583E-07 | global batch size:    64 | lm loss: 6.481697E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 12:55:30] iteration      922/    1000 | consumed samples:        59008 | elapsed time per iteration (ms): 80544.6 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 1.777626E-07 | global batch size:    64 | lm loss: 6.707134E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555dee104680] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 12:56:50] iteration      923/    1000 | consumed samples:        59072 | elapsed time per iteration (ms): 79886.4 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 1.757917E-07 | global batch size:    64 | lm loss: 6.061962E-01 | loss scale: 1.0 | grad norm: 0.873 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:58:11] iteration      924/    1000 | consumed samples:        59136 | elapsed time per iteration (ms): 80945.2 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 1.738458E-07 | global batch size:    64 | lm loss: 6.526064E-01 | loss scale: 1.0 | grad norm: 1.038 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 12:59:34] iteration      925/    1000 | consumed samples:        59200 | elapsed time per iteration (ms): 83194.9 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 1.719248E-07 | global batch size:    64 | lm loss: 6.461999E-01 | loss scale: 1.0 | grad norm: 0.801 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555defd60880] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956d792c0] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 13:00:55] iteration      926/    1000 | consumed samples:        59264 | elapsed time per iteration (ms): 81298.6 | throughput per GPU (TFLOP/s/GPU): 94.8 | learning rate: 1.700287E-07 | global batch size:    64 | lm loss: 6.680710E-01 | loss scale: 1.0 | grad norm: 1.062 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 13:03:38] iteration      927/    1000 | consumed samples:        59328 | elapsed time per iteration (ms): 162383.2 | throughput per GPU (TFLOP/s/GPU): 47.5 | learning rate: 1.681576E-07 | global batch size:    64 | lm loss: 6.349767E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
 [2024-11-28 13:05:00] iteration      928/    1000 | consumed samples:        59392 | elapsed time per iteration (ms): 82192.0 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 1.663114E-07 | global batch size:    64 | lm loss: 7.176995E-01 | loss scale: 1.0 | grad norm: 0.912 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:06:59] iteration      929/    1000 | consumed samples:        59456 | elapsed time per iteration (ms): 118896.2 | throughput per GPU (TFLOP/s/GPU): 64.8 | learning rate: 1.644903E-07 | global batch size:    64 | lm loss: 7.511212E-01 | loss scale: 1.0 | grad norm: 1.079 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x555def9a5ac0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
[h264 @ 0x55d95742d280] mmco: unref short failure
 [2024-11-28 13:09:17] iteration      930/    1000 | consumed samples:        59520 | elapsed time per iteration (ms): 137974.5 | throughput per GPU (TFLOP/s/GPU): 55.9 | learning rate: 1.626942E-07 | global batch size:    64 | lm loss: 6.085041E-01 | loss scale: 1.0 | grad norm: 1.197 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
 [2024-11-28 13:11:09] iteration      931/    1000 | consumed samples:        59584 | elapsed time per iteration (ms): 112172.9 | throughput per GPU (TFLOP/s/GPU): 68.7 | learning rate: 1.609232E-07 | global batch size:    64 | lm loss: 6.549844E-01 | loss scale: 1.0 | grad norm: 0.860 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 13:12:27] iteration      932/    1000 | consumed samples:        59648 | elapsed time per iteration (ms): 77561.0 | throughput per GPU (TFLOP/s/GPU): 99.4 | learning rate: 1.591772E-07 | global batch size:    64 | lm loss: 6.965685E-01 | loss scale: 1.0 | grad norm: 0.962 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 13:14:04] iteration      933/    1000 | consumed samples:        59712 | elapsed time per iteration (ms): 97431.5 | throughput per GPU (TFLOP/s/GPU): 79.1 | learning rate: 1.574562E-07 | global batch size:    64 | lm loss: 7.647378E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 13:15:25] iteration      934/    1000 | consumed samples:        59776 | elapsed time per iteration (ms): 81194.6 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 1.557604E-07 | global batch size:    64 | lm loss: 6.471760E-01 | loss scale: 1.0 | grad norm: 0.905 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x55d957f0f640] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dec5cd500] mmco: unref short failure
[h264 @ 0x555dec5cd500] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555dec5cd500] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x55d959b29f40] mmco: unref short failure
[h264 @ 0x555dec5cd500] mmco: unref short failure
[h264 @ 0x555dec5cd500] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555dedbd1740] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 13:16:41] iteration      935/    1000 | consumed samples:        59840 | elapsed time per iteration (ms): 75355.6 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.540897E-07 | global batch size:    64 | lm loss: 6.300914E-01 | loss scale: 1.0 | grad norm: 0.981 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:19:26] iteration      936/    1000 | consumed samples:        59904 | elapsed time per iteration (ms): 164799.3 | throughput per GPU (TFLOP/s/GPU): 46.8 | learning rate: 1.524441E-07 | global batch size:    64 | lm loss: 6.399311E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df333fac0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 13:21:03] iteration      937/    1000 | consumed samples:        59968 | elapsed time per iteration (ms): 97316.7 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 1.508237E-07 | global batch size:    64 | lm loss: 6.114424E-01 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec73eb00] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
 [2024-11-28 13:22:34] iteration      938/    1000 | consumed samples:        60032 | elapsed time per iteration (ms): 90834.0 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 1.492284E-07 | global batch size:    64 | lm loss: 7.345959E-01 | loss scale: 1.0 | grad norm: 1.041 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 13:24:27] iteration      939/    1000 | consumed samples:        60096 | elapsed time per iteration (ms): 113150.1 | throughput per GPU (TFLOP/s/GPU): 68.1 | learning rate: 1.476583E-07 | global batch size:    64 | lm loss: 6.628835E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:25:49] iteration      940/    1000 | consumed samples:        60160 | elapsed time per iteration (ms): 81984.3 | throughput per GPU (TFLOP/s/GPU): 94.0 | learning rate: 1.461135E-07 | global batch size:    64 | lm loss: 6.852534E-01 | loss scale: 1.0 | grad norm: 0.761 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x55d956defa80] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dee6ec240] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 13:27:03] iteration      941/    1000 | consumed samples:        60224 | elapsed time per iteration (ms): 74320.8 | throughput per GPU (TFLOP/s/GPU): 103.7 | learning rate: 1.445938E-07 | global batch size:    64 | lm loss: 7.116451E-01 | loss scale: 1.0 | grad norm: 0.812 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
 [2024-11-28 13:28:27] iteration      942/    1000 | consumed samples:        60288 | elapsed time per iteration (ms): 83413.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.430994E-07 | global batch size:    64 | lm loss: 6.267360E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
 [2024-11-28 13:29:47] iteration      943/    1000 | consumed samples:        60352 | elapsed time per iteration (ms): 80788.2 | throughput per GPU (TFLOP/s/GPU): 95.4 | learning rate: 1.416302E-07 | global batch size:    64 | lm loss: 6.856556E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x555dede47780] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
 [2024-11-28 13:30:58] iteration      944/    1000 | consumed samples:        60416 | elapsed time per iteration (ms): 70071.0 | throughput per GPU (TFLOP/s/GPU): 110.0 | learning rate: 1.401863E-07 | global batch size:    64 | lm loss: 7.190543E-01 | loss scale: 1.0 | grad norm: 1.094 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 13:32:32] iteration      945/    1000 | consumed samples:        60480 | elapsed time per iteration (ms): 94661.8 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.387676E-07 | global batch size:    64 | lm loss: 6.541150E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x555dedc46200] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
 [2024-11-28 13:33:54] iteration      946/    1000 | consumed samples:        60544 | elapsed time per iteration (ms): 81568.8 | throughput per GPU (TFLOP/s/GPU): 94.5 | learning rate: 1.373743E-07 | global batch size:    64 | lm loss: 6.668899E-01 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d9586ad200] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x555dece4d600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 13:35:17] iteration      947/    1000 | consumed samples:        60608 | elapsed time per iteration (ms): 83452.5 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 1.360062E-07 | global batch size:    64 | lm loss: 7.081062E-01 | loss scale: 1.0 | grad norm: 0.961 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
 [2024-11-28 13:36:37] iteration      948/    1000 | consumed samples:        60672 | elapsed time per iteration (ms): 79886.5 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 1.346635E-07 | global batch size:    64 | lm loss: 6.683935E-01 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
[h264 @ 0x55d95873d100] mmco: unref short failure
[h264 @ 0x555dee700e80] mmco: unref short failure
 [2024-11-28 13:37:56] iteration      949/    1000 | consumed samples:        60736 | elapsed time per iteration (ms): 79185.2 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 1.333461E-07 | global batch size:    64 | lm loss: 7.324390E-01 | loss scale: 1.0 | grad norm: 0.917 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
[h264 @ 0x55d957fc08c0] mmco: unref short failure
 [2024-11-28 13:40:22] iteration      950/    1000 | consumed samples:        60800 | elapsed time per iteration (ms): 145884.4 | throughput per GPU (TFLOP/s/GPU): 52.8 | learning rate: 1.320541E-07 | global batch size:    64 | lm loss: 7.290564E-01 | loss scale: 1.0 | grad norm: 0.872 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:42:08] iteration      951/    1000 | consumed samples:        60864 | elapsed time per iteration (ms): 105541.4 | throughput per GPU (TFLOP/s/GPU): 73.0 | learning rate: 1.307874E-07 | global batch size:    64 | lm loss: 6.607154E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
 [2024-11-28 13:43:15] iteration      952/    1000 | consumed samples:        60928 | elapsed time per iteration (ms): 67102.5 | throughput per GPU (TFLOP/s/GPU): 114.9 | learning rate: 1.295461E-07 | global batch size:    64 | lm loss: 6.186213E-01 | loss scale: 1.0 | grad norm: 0.969 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
 [2024-11-28 13:44:39] iteration      953/    1000 | consumed samples:        60992 | elapsed time per iteration (ms): 83875.1 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 1.283302E-07 | global batch size:    64 | lm loss: 6.808333E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955b8bcc0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 13:45:46] iteration      954/    1000 | consumed samples:        61056 | elapsed time per iteration (ms): 67345.2 | throughput per GPU (TFLOP/s/GPU): 114.5 | learning rate: 1.271397E-07 | global batch size:    64 | lm loss: 6.142411E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:47:07] iteration      955/    1000 | consumed samples:        61120 | elapsed time per iteration (ms): 81067.4 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 1.259746E-07 | global batch size:    64 | lm loss: 6.965019E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
 [2024-11-28 13:48:25] iteration      956/    1000 | consumed samples:        61184 | elapsed time per iteration (ms): 77374.3 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 1.248349E-07 | global batch size:    64 | lm loss: 7.084676E-01 | loss scale: 1.0 | grad norm: 1.119 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:49:44] iteration      957/    1000 | consumed samples:        61248 | elapsed time per iteration (ms): 79787.0 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 1.237207E-07 | global batch size:    64 | lm loss: 6.091076E-01 | loss scale: 1.0 | grad norm: 0.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
 [2024-11-28 13:51:04] iteration      958/    1000 | consumed samples:        61312 | elapsed time per iteration (ms): 79308.8 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.226319E-07 | global batch size:    64 | lm loss: 6.252887E-01 | loss scale: 1.0 | grad norm: 0.887 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d9560849c0] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
 [2024-11-28 13:52:42] iteration      959/    1000 | consumed samples:        61376 | elapsed time per iteration (ms): 98444.8 | throughput per GPU (TFLOP/s/GPU): 78.3 | learning rate: 1.215686E-07 | global batch size:    64 | lm loss: 6.871887E-01 | loss scale: 1.0 | grad norm: 0.990 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 13:54:30] iteration      960/    1000 | consumed samples:        61440 | elapsed time per iteration (ms): 108204.2 | throughput per GPU (TFLOP/s/GPU): 71.2 | learning rate: 1.205308E-07 | global batch size:    64 | lm loss: 6.405410E-01 | loss scale: 1.0 | grad norm: 0.826 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d9565f0700] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
 [2024-11-28 13:56:02] iteration      961/    1000 | consumed samples:        61504 | elapsed time per iteration (ms): 91685.8 | throughput per GPU (TFLOP/s/GPU): 84.1 | learning rate: 1.195184E-07 | global batch size:    64 | lm loss: 6.903030E-01 | loss scale: 1.0 | grad norm: 0.916 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
 [2024-11-28 13:57:42] iteration      962/    1000 | consumed samples:        61568 | elapsed time per iteration (ms): 100042.4 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 1.185315E-07 | global batch size:    64 | lm loss: 6.939712E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
[h264 @ 0x555dee570b40] mmco: unref short failure
[h264 @ 0x55d955861580] mmco: unref short failure
 [2024-11-28 13:59:10] iteration      963/    1000 | consumed samples:        61632 | elapsed time per iteration (ms): 87425.2 | throughput per GPU (TFLOP/s/GPU): 88.2 | learning rate: 1.175702E-07 | global batch size:    64 | lm loss: 6.489189E-01 | loss scale: 1.0 | grad norm: 1.170 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
 [2024-11-28 14:00:35] iteration      964/    1000 | consumed samples:        61696 | elapsed time per iteration (ms): 85013.2 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 1.166343E-07 | global batch size:    64 | lm loss: 6.793079E-01 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x55d956b88f40] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d958e05fc0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d956f7b580] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 14:02:34] iteration      965/    1000 | consumed samples:        61760 | elapsed time per iteration (ms): 119301.6 | throughput per GPU (TFLOP/s/GPU): 64.6 | learning rate: 1.157240E-07 | global batch size:    64 | lm loss: 5.998636E-01 | loss scale: 1.0 | grad norm: 0.835 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
 [2024-11-28 14:04:00] iteration      966/    1000 | consumed samples:        61824 | elapsed time per iteration (ms): 85603.7 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.148392E-07 | global batch size:    64 | lm loss: 5.948734E-01 | loss scale: 1.0 | grad norm: 1.641 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
 [2024-11-28 14:05:11] iteration      967/    1000 | consumed samples:        61888 | elapsed time per iteration (ms): 71636.9 | throughput per GPU (TFLOP/s/GPU): 107.6 | learning rate: 1.139800E-07 | global batch size:    64 | lm loss: 6.451741E-01 | loss scale: 1.0 | grad norm: 0.939 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 14:06:19] iteration      968/    1000 | consumed samples:        61952 | elapsed time per iteration (ms): 67761.0 | throughput per GPU (TFLOP/s/GPU): 113.8 | learning rate: 1.131463E-07 | global batch size:    64 | lm loss: 6.464132E-01 | loss scale: 1.0 | grad norm: 0.830 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x55d9593709c0] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d959b34680] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
processed_samples 3500 unjoint_samples 3500 joint_samples 240 [37535, 126039]
processed_samples 3500 unjoint_samples 3500 joint_samples 240 [37535, 126039]
[h264 @ 0x55d959d1ea40] mmco: unref short failure
processed_samples 3500 unjoint_samples 3500 joint_samples 243 [117828, 123942]
processed_samples 3500 unjoint_samples 3500 joint_samples 248 [61546, 125289]
[h264 @ 0x555ded7e1040] mmco: unref short failure
processed_samples 3500 unjoint_samples 3500 joint_samples 244 [124745, 128771]
processed_samples 3500 unjoint_samples 3500 joint_samples 243 [117828, 123942]
processed_samples 3500 unjoint_samples 3500 joint_samples 242 [127735, 101974]
processed_samples 3500 unjoint_samples 3500 joint_samples 248 [61546, 125289]
processed_samples 3500 unjoint_samples 3500 joint_samples 240 [117489, 97448]
processed_samples 3500 unjoint_samples 3500 joint_samples 243 [99233, 90783]
processed_samples 3500 unjoint_samples 3500 joint_samples 244 [124745, 128771]
processed_samples 3501 unjoint_samples 3500 joint_samples 236 [111699, 114901]
processed_samples 3500 unjoint_samples 3500 joint_samples 242 [127735, 101974]
processed_samples 3500 unjoint_samples 3500 joint_samples 240 [117489, 97448]
processed_samples 3500 unjoint_samples 3500 joint_samples 243 [99233, 90783]
processed_samples 3501 unjoint_samples 3500 joint_samples 236 [111699, 114901]
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee588f00] mmco: unref short failure
 [2024-11-28 14:07:46] iteration      969/    1000 | consumed samples:        62016 | elapsed time per iteration (ms): 87505.2 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 1.123382E-07 | global batch size:    64 | lm loss: 6.732799E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
 [2024-11-28 14:09:42] iteration      970/    1000 | consumed samples:        62080 | elapsed time per iteration (ms): 115166.3 | throughput per GPU (TFLOP/s/GPU): 66.9 | learning rate: 1.115556E-07 | global batch size:    64 | lm loss: 6.146312E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x55d9567ebec0] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
 [2024-11-28 14:10:59] iteration      971/    1000 | consumed samples:        62144 | elapsed time per iteration (ms): 77734.4 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 1.107986E-07 | global batch size:    64 | lm loss: 6.231325E-01 | loss scale: 1.0 | grad norm: 0.734 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
 [2024-11-28 14:12:25] iteration      972/    1000 | consumed samples:        62208 | elapsed time per iteration (ms): 85280.0 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.100672E-07 | global batch size:    64 | lm loss: 6.697411E-01 | loss scale: 1.0 | grad norm: 0.819 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555dedf05880] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d956386f00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555deee52400] mmco: unref short failure
 [2024-11-28 14:13:45] iteration      973/    1000 | consumed samples:        62272 | elapsed time per iteration (ms): 80061.8 | throughput per GPU (TFLOP/s/GPU): 96.3 | learning rate: 1.093615E-07 | global batch size:    64 | lm loss: 6.537416E-01 | loss scale: 1.0 | grad norm: 0.809 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
 [2024-11-28 14:15:17] iteration      974/    1000 | consumed samples:        62336 | elapsed time per iteration (ms): 92646.4 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 1.086813E-07 | global batch size:    64 | lm loss: 7.027911E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
 [2024-11-28 14:16:52] iteration      975/    1000 | consumed samples:        62400 | elapsed time per iteration (ms): 94500.1 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 1.080267E-07 | global batch size:    64 | lm loss: 7.508754E-01 | loss scale: 1.0 | grad norm: 1.033 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dee0f8a80] mmco: unref short failure
[h264 @ 0x55d956aae040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x555ded70ae00] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
 [2024-11-28 14:18:09] iteration      976/    1000 | consumed samples:        62464 | elapsed time per iteration (ms): 77501.4 | throughput per GPU (TFLOP/s/GPU): 99.5 | learning rate: 1.073977E-07 | global batch size:    64 | lm loss: 7.104969E-01 | loss scale: 1.0 | grad norm: 1.024 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:19:30] iteration      977/    1000 | consumed samples:        62528 | elapsed time per iteration (ms): 80497.2 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 1.067943E-07 | global batch size:    64 | lm loss: 6.504764E-01 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x555df32c1f40] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
 [2024-11-28 14:20:53] iteration      978/    1000 | consumed samples:        62592 | elapsed time per iteration (ms): 83270.3 | throughput per GPU (TFLOP/s/GPU): 92.6 | learning rate: 1.062166E-07 | global batch size:    64 | lm loss: 7.602927E-01 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x555dee0de680] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
[h264 @ 0x55d957fe7780] mmco: unref short failure
 [2024-11-28 14:22:28] iteration      979/    1000 | consumed samples:        62656 | elapsed time per iteration (ms): 94731.5 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.056645E-07 | global batch size:    64 | lm loss: 6.811559E-01 | loss scale: 1.0 | grad norm: 0.965 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d957cd6140] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 14:24:05] iteration      980/    1000 | consumed samples:        62720 | elapsed time per iteration (ms): 97338.2 | throughput per GPU (TFLOP/s/GPU): 79.2 | learning rate: 1.051381E-07 | global batch size:    64 | lm loss: 6.308494E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:25:32] iteration      981/    1000 | consumed samples:        62784 | elapsed time per iteration (ms): 86780.9 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 1.046373E-07 | global batch size:    64 | lm loss: 6.862647E-01 | loss scale: 1.0 | grad norm: 0.902 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d957a03080] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 14:27:22] iteration      982/    1000 | consumed samples:        62848 | elapsed time per iteration (ms): 110273.4 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.041621E-07 | global batch size:    64 | lm loss: 6.260334E-01 | loss scale: 1.0 | grad norm: 0.784 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x55d955acf300] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec836c00] mmco: unref short failure
[h264 @ 0x55d95827cc40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d955d24640] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x555dedfc6580] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d959d1ea40] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x555dedb7a200] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 14:28:41] iteration      983/    1000 | consumed samples:        62912 | elapsed time per iteration (ms): 78985.5 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 1.037126E-07 | global batch size:    64 | lm loss: 6.394918E-01 | loss scale: 1.0 | grad norm: 0.794 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
 [2024-11-28 14:30:07] iteration      984/    1000 | consumed samples:        62976 | elapsed time per iteration (ms): 85222.2 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.032888E-07 | global batch size:    64 | lm loss: 6.519013E-01 | loss scale: 1.0 | grad norm: 0.713 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:31:21] iteration      985/    1000 | consumed samples:        63040 | elapsed time per iteration (ms): 74396.5 | throughput per GPU (TFLOP/s/GPU): 103.6 | learning rate: 1.028906E-07 | global batch size:    64 | lm loss: 6.667175E-01 | loss scale: 1.0 | grad norm: 0.862 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d9581bc7c0] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x55d956da5240] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
[h264 @ 0x555dec811980] mmco: unref short failure
 [2024-11-28 14:32:53] iteration      986/    1000 | consumed samples:        63104 | elapsed time per iteration (ms): 92158.6 | throughput per GPU (TFLOP/s/GPU): 83.6 | learning rate: 1.025181E-07 | global batch size:    64 | lm loss: 6.164773E-01 | loss scale: 1.0 | grad norm: 1.084 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d956f36e40] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
 [2024-11-28 14:34:18] iteration      987/    1000 | consumed samples:        63168 | elapsed time per iteration (ms): 85187.1 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.021713E-07 | global batch size:    64 | lm loss: 6.563728E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:35:51] iteration      988/    1000 | consumed samples:        63232 | elapsed time per iteration (ms): 92580.3 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 1.018501E-07 | global batch size:    64 | lm loss: 6.570765E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555dedf89ec0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x555df4169bc0] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d955d4f640] mmco: unref short failure
[h264 @ 0x55d9568d0900] mmco: unref short failure
[h264 @ 0x555dece9f180] mmco: unref short failure
 [2024-11-28 14:36:59] iteration      989/    1000 | consumed samples:        63296 | elapsed time per iteration (ms): 67976.0 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 1.015546E-07 | global batch size:    64 | lm loss: 7.026603E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x55d95678a600] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555df4961e80] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555deda84c00] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956ee5780] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
 [2024-11-28 14:38:14] iteration      990/    1000 | consumed samples:        63360 | elapsed time per iteration (ms): 75326.0 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.012849E-07 | global batch size:    64 | lm loss: 6.594824E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x55d95c76c1c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x555dec9973c0] mmco: unref short failure
[h264 @ 0x55d95b718d00] mmco: unref short failure
[h264 @ 0x555def216fc0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x555df1cb0600] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
[h264 @ 0x55d9569edbc0] mmco: unref short failure
 [2024-11-28 14:39:43] iteration      991/    1000 | consumed samples:        63424 | elapsed time per iteration (ms): 88711.4 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 1.010408E-07 | global batch size:    64 | lm loss: 6.635240E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555dec5a5140] mmco: unref short failure
[h264 @ 0x55d957fbfd40] mmco: unref short failure
 [2024-11-28 14:41:11] iteration      992/    1000 | consumed samples:        63488 | elapsed time per iteration (ms): 87459.0 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 1.008223E-07 | global batch size:    64 | lm loss: 7.460098E-01 | loss scale: 1.0 | grad norm: 0.901 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:42:23] iteration      993/    1000 | consumed samples:        63552 | elapsed time per iteration (ms): 72575.2 | throughput per GPU (TFLOP/s/GPU): 106.2 | learning rate: 1.006296E-07 | global batch size:    64 | lm loss: 6.943420E-01 | loss scale: 1.0 | grad norm: 0.979 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:43:46] iteration      994/    1000 | consumed samples:        63616 | elapsed time per iteration (ms): 83141.3 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 1.004626E-07 | global batch size:    64 | lm loss: 6.353670E-01 | loss scale: 1.0 | grad norm: 0.996 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded7e1040] [h264 @ 0x55d95707f900] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x55d956b4f800] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956f36840] mmco: unref short failure
 [2024-11-28 14:45:46] iteration      995/    1000 | consumed samples:        63680 | elapsed time per iteration (ms): 119455.2 | throughput per GPU (TFLOP/s/GPU): 64.5 | learning rate: 1.003212E-07 | global batch size:    64 | lm loss: 6.850454E-01 | loss scale: 1.0 | grad norm: 0.907 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d957883f80] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
 [2024-11-28 14:47:25] iteration      996/    1000 | consumed samples:        63744 | elapsed time per iteration (ms): 99541.9 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 1.002056E-07 | global batch size:    64 | lm loss: 6.471719E-01 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:48:59] iteration      997/    1000 | consumed samples:        63808 | elapsed time per iteration (ms): 93832.3 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 1.001156E-07 | global batch size:    64 | lm loss: 6.529545E-01 | loss scale: 1.0 | grad norm: 1.344 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555dee100dc0] mmco: unref short failure
[h264 @ 0x55d956767080] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x555ded696c00] mmco: unref short failure
[h264 @ 0x55d956122340] mmco: unref short failure
[h264 @ 0x555ded7e1040] mmco: unref short failure
[h264 @ 0x55d95707f900] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
[h264 @ 0x55d955e4acc0] mmco: unref short failure
[h264 @ 0x555decb36640] mmco: unref short failure
 [2024-11-28 14:50:35] iteration      998/    1000 | consumed samples:        63872 | elapsed time per iteration (ms): 96048.8 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 1.000514E-07 | global batch size:    64 | lm loss: 6.684725E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 14:52:26] iteration      999/    1000 | consumed samples:        63936 | elapsed time per iteration (ms): 110316.3 | throughput per GPU (TFLOP/s/GPU): 69.9 | learning rate: 1.000128E-07 | global batch size:    64 | lm loss: 6.331837E-01 | loss scale: 1.0 | grad norm: 0.849 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x555df0f7cd00] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x55d9559dfc40] mmco: unref short failure
[h264 @ 0x555ded679600] mmco: unref short failure
[h264 @ 0x55d9560aea40] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
[h264 @ 0x555df136f280] mmco: unref short failure
[h264 @ 0x55d956042040] mmco: unref short failure
 [2024-11-28 14:53:45] iteration     1000/    1000 | consumed samples:        64000 | elapsed time per iteration (ms): 79276.2 | throughput per GPU (TFLOP/s/GPU): 97.2 | learning rate: 1.000000E-07 | global batch size:    64 | lm loss: 7.160559E-01 | loss scale: 1.0 | grad norm: 1.057 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (274392.58, 274393.16)
[2024-11-28 15:04:24,072] torch.distributed.elastic.agent.server.api: [ERROR] Error waiting on exit barrier. Elapsed: 300.10606265068054 seconds
+ set +x