diff --git "a/stderr.log" "b/stderr.log" new file mode 100644--- /dev/null +++ "b/stderr.log" @@ -0,0 +1,579 @@ ++ deepspeed --master_port 51152 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/pos/10000/train.json --model_name_or_path /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-1T/tinyllama-1T-s3-Q1-10000 --log_type wandb --log_run_name imdb-tinyllama-1T-s3-Q1-10000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit +[rank5]:[W529 17:25:51.014365012 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank7]:[W529 17:25:51.021848873 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank6]:[W529 17:25:51.119147718 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank3]:[W529 17:25:52.805304718 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank0]:[W529 17:25:53.276718923 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank1]:[W529 17:25:53.276893358 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank4]:[W529 17:25:53.276897994 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +[rank2]:[W529 17:25:53.282328237 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/config.json +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +Model config LlamaConfig { + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.52.1", + "use_cache": true, + "vocab_size": 32000 +} + +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/model.safetensors +Will use torch_dtype=torch.float32 as defined in model's config object +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Will use torch_dtype=torch.float32 as defined in model's config object +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Will use torch_dtype=torch.float32 as defined in model's config object +Will use torch_dtype=torch.float32 as defined in model's config object +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Will use torch_dtype=torch.float32 as defined in model's config object +Will use torch_dtype=torch.float32 as defined in model's config object +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Will use torch_dtype=torch.float32 as defined in model's config object +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Will use torch_dtype=torch.float32 as defined in model's config object +Instantiating LlamaForCausalLM model under default dtype torch.float32. +Detected DeepSpeed ZeRO-3: activating zero.init() for this model +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} + +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +loading file tokenizer.model +loading file tokenizer.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file chat_template.jinja +loading file tokenizer.model +loading file tokenizer.model +loading file tokenizer.model +loading file tokenizer.json +loading file tokenizer.json +loading file tokenizer.model +loading file tokenizer.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file added_tokens.json +loading file tokenizer.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file added_tokens.json +loading file tokenizer.model +loading file chat_template.jinja +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file tokenizer_config.json +loading file chat_template.jinja +loading file tokenizer_config.json +loading file chat_template.jinja +loading file chat_template.jinja +loading file tokenizer.json +loading file added_tokens.json +loading file tokenizer.model +loading file tokenizer.json +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file added_tokens.json +loading file chat_template.jinja +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file chat_template.jinja +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +All model checkpoint weights were used when initializing LlamaForCausalLM. + +All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-480k-1T/generation_config.json +Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0 +} + +loading file tokenizer.model +loading file tokenizer.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file tokenizer_config.json +loading file chat_template.jinja +You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False` +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... +/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module fused_adam... +Loading extension module fused_adam...Loading extension module fused_adam...Loading extension module fused_adam... + + +Loading extension module fused_adam... +Loading extension module fused_adam... +Loading extension module fused_adam... +Loading extension module fused_adam... +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.19.11 +wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-1T/tinyllama-1T-s3-Q1-10000/wandb/run-20250529_172629-0t1vq5hu +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run imdb-tinyllama-1T-s3-Q1-10000 +wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb +wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/0t1vq5hu + Training 1/1 epoch: 0%| | 0/1250 [00:00.wrapper of > +Traceback (most recent call last): + File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close + self.wandb.finish() + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper + return func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper + return func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper + return func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish + return self._finish(exit_code) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper + return func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish + self._atexit_cleanup(exit_code=exit_code) + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup + self._on_finish() + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish + wait_with_progress( + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress + return wait_all_with_progress( + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress + return asyncio_compat.run(progress_loop_with_timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run + future = executor.submit(runner.run, fn) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit + raise RuntimeError('cannot schedule new futures after ' +RuntimeError: cannot schedule new futures after interpreter shutdown