|
+ deepspeed --master_port 32163 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/1000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000-Q2-1000 --log_type wandb --log_run_name imdb-tinyllama-3T-s3-Q1-1000-Q2-1000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
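A quick sanity check of the run geometry implied by these flags (a sketch, not part of safe_rlhf; the 8-GPU world size is inferred from the rank 0-7 messages below):

import math

num_examples = 1000   # train/neg/1000/train.json
world_size = 8        # ranks 0-7 appear below
per_device_batch = 1  # --per_device_train_batch_size
grad_accum = 8        # --gradient_accumulation_steps

micro_steps = num_examples // (world_size * per_device_batch)   # 125, matching the 125-step bar below
optimizer_steps = math.ceil(micro_steps / grad_accum)           # ~16 parameter updates
effective_batch = world_size * per_device_batch * grad_accum    # 64 examples per update
print(micro_steps, optimizer_steps, effective_batch)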
|
nvcc warning : incompatible redefinition for option  (identical warning repeated 70×)
|
[rank6]:[W527 21:17:51.414382570 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank3]:[W527 21:17:51.414461553 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank2]:[W527 21:17:51.430257838 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank4]:[W527 21:17:51.458945198 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank7]:[W527 21:17:51.459625064 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank1]:[W527 21:17:51.477309608 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank5]:[W527 21:17:51.536670687 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank0]:[W527 21:17:52.662932258 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
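The fix the warning suggests, sketched (assumes the LOCAL_RANK environment variable set by the deepspeed launcher; the device_id argument requires a recent PyTorch):

import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # set by the deepspeed launcher
torch.cuda.set_device(local_rank)

# Bind each rank to its GPU at init time so NCCL never has to guess:
dist.init_process_group(backend="nccl", device_id=torch.device(f"cuda:{local_rank}"))

# Or, without device_id, pin the barrier itself:
dist.barrier(device_ids=[local_rank])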
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000/config.json  (×8, once per rank)
|
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pad_token_id": 32000,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.52.1",
  "use_cache": true,
  "vocab_size": 32001
}
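The vocab_size of 32001 (vs. the usual 32000 for TinyLlama/LLaMA tokenizers) together with pad_token_id = 32000 indicates one pad token appended on top of the base vocabulary. A minimal sketch of inspecting this, assuming only the checkpoint path from the log:

from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000"
)
print(config.vocab_size, config.pad_token_id)  # 32001, 32000: one extra pad token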
|
|
|
|
|
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000/pytorch_model.bin  (×8, once per rank)

Will use torch_dtype=torch.float32 as defined in model

Instantiating LlamaForCausalLM model under default dtype torch.float32.

Detected DeepSpeed ZeRO-3: activating zero.init() for this model
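"Detected DeepSpeed ZeRO-3: activating zero.init()" means transformers saw a live ZeRO-3 DeepSpeed config and instantiated the model sharded, so each rank materializes only its partition of the float32 weights. A minimal sketch of that mechanism, assuming a hand-written ds_config dict (the real run builds its config from the CLI flags above):

from transformers import AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

ds_config = {"zero_optimization": {"stage": 3}}  # assumption: stripped-down stand-in config
dschf = HfDeepSpeedConfig(ds_config)  # must be created (and kept alive) before from_pretrained
model = AutoModelForCausalLM.from_pretrained(
    "/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000"
)  # weights now load under zero.init(), partitioned across ranks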
|
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 32000
}
|
|
|
All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000.

If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

Generation config file not found, using a generation config created from the model config.

loading file tokenizer.model

loading file tokenizer.json

loading file added_tokens.json

loading file special_tokens_map.json

loading file tokenizer_config.json

loading file chat_template.jinja
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...  (×8, once per rank)
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
|
/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
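The remedy the warning names, sketched (the architecture string is an assumption; match it to your GPUs' compute capability):

import os

# Pin the build targets before any CUDA extension compiles, so ninja
# builds one architecture instead of every visible one.
# "8.0" (A100) is an assumption; substitute your GPUs' compute capability.
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0"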
|
Building extension module fused_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
Loading extension module fused_adam...  (×8, once per rank)
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.  (once per rank)
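A minimal sketch of avoiding this warning at setup time: the KV cache is unused during training and must be off under gradient checkpointing, so disable it explicitly before training starts.

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000"
)
model.config.use_cache = False         # silence the warning up front
model.gradient_checkpointing_enable()  # recompute activations instead of caching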
|
wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
|
wandb: Tracking run with wandb version 0.19.11 |
|
wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000-Q2-1000/wandb/run-20250527_211808-il90uot9 |
|
wandb: Run `wandb offline` to turn off syncing. |
|
wandb: Syncing run imdb-tinyllama-3T-s3-Q1-1000-Q2-1000 |
|
wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb
|
wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/il90uot9
|
Training 1/1 epoch:   0%|          | 0/125 [00:00<?, ?it/s]
Training 1/1 epoch (loss 2.6787):   1%|          | 1/125 [00:09<19:18, 9.34s/it]
Training 1/1 epoch (loss 2.5563):  20%|██        | 25/125 [00:45<02:38, 1.58s/it]
Training 1/1 epoch (loss 2.5419):  40%|████      | 50/125 [01:22<02:07, 1.69s/it]
Training 1/1 epoch (loss 2.6048):  60%|██████    | 75/125 [01:58<01:10, 1.41s/it]
Training 1/1 epoch (loss 2.5074):  80%|████████  | 100/125 [02:31<00:27, 1.10s/it]
Training 1/1 epoch (loss 2.6418): 100%|██████████| 125/125 [03:06<00:00, 1.50s/it]
|
tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000-Q2-1000/tokenizer_config.json |
|
Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-1000-Q2-1000/special_tokens_map.json |
|
wandb: ERROR Problem finishing run |
|
Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550cc7ce510>>
Traceback (most recent call last):
  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close
    self.wandb.finish()
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish
    return self._finish(exit_code)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish
    self._atexit_cleanup(exit_code=exit_code)
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup
    self._on_finish()
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish
    wait_with_progress(
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
    return wait_all_with_progress(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
    return asyncio_compat.run(progress_loop_with_timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
    future = executor.submit(runner.run, fn)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
    raise RuntimeError(
RuntimeError: cannot schedule new futures after interpreter shutdown
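The training run itself completed (the checkpoint, tokenizer config, and special tokens were saved above); the failure is the wandb atexit hook trying to schedule work during interpreter shutdown. A hedged workaround is to finish the run explicitly while the process is still fully alive, rather than relying on the atexit callback:

import wandb

run = wandb.init(project="Inverse_Alignment_IMDb",
                 name="imdb-tinyllama-3T-s3-Q1-1000-Q2-1000")
# ... training ...
run.finish()  # flush and close while thread pools still accept work;
              # the atexit path above raced interpreter shutdown instead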
|
|