|
+ deepspeed --master_port 58025 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000-Q2-2000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-1000-Q2-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
nvcc warning : incompatible redefinition for option |
|
[rank6]:[W527 16:08:47.795486180 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank3]:[W527 16:08:47.797175626 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank7]:[W527 16:08:47.801302410 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank2]:[W527 16:08:47.811708090 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank4]:[W527 16:08:47.813442418 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank0]:[W527 16:08:47.815279122 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank1]:[W527 16:08:47.816501140 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank5]:[W527 16:08:47.816522987 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/config.json |
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
Model config LlamaConfig { |
|
"architectures": [ |
|
"LlamaForCausalLM" |
|
], |
|
"attention_bias": false, |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"head_dim": 64, |
|
"hidden_act": "silu", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 5632, |
|
"max_position_embeddings": 2048, |
|
"mlp_bias": false, |
|
"model_type": "llama", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 22, |
|
"num_key_value_heads": 4, |
|
"pad_token_id": 32000, |
|
"pretraining_tp": 1, |
|
"rms_norm_eps": 1e-05, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.52.1", |
|
"use_cache": true, |
|
"vocab_size": 32001 |
|
} |
|
|
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/pytorch_model.bin |
|
Will use torch_dtype=torch.float32 as defined in model |
|
Instantiating LlamaForCausalLM model under default dtype torch.float32. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 1, |
|
"eos_token_id": 2, |
|
"pad_token_id": 32000 |
|
} |
|
|
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
Generation config file not found, using a generation config created from the model config. |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.model |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file special_tokens_map.json |
|
loading file special_tokens_map.json |
|
loading file special_tokens_map.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file tokenizer_config.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file chat_template.jinja |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file chat_template.jinja |
|
loading file chat_template.jinja |
|
loading file chat_template.jinja |
|
All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
|
All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
|
Generation config file not found, using a generation config created from the model config. |
|
loading file tokenizer.model |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
|
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
|
/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. |
|
If this is not desired, please set os.environ[ |
|
warnings.warn( |
|
Building extension module fused_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam...Loading extension module fused_adam... |
|
|
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam... |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
|
wandb: Tracking run with wandb version 0.19.11 |
|
wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000-Q2-2000/wandb/run-20250527_160905-w42kp8cp |
|
wandb: Run `wandb offline` to turn off syncing. |
|
wandb: Syncing run imdb-tinyllama-2T-s3-Q1-1000-Q2-2000 |
|
wandb: βοΈ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb |
|
wandb: π View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/w42kp8cp |
|
Training 1/1 epoch: 0%| | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
Training 1/1 epoch (loss 2.8327): 0%| | 0/250 [00:07<?, ?it/s]
Training 1/1 epoch (loss 2.8327): 0%| | 1/250 [00:07<32:41, 7.88s/it]
Training 1/1 epoch (loss 2.8658): 0%| | 1/250 [00:10<32:41, 7.88s/it]
Training 1/1 epoch (loss 2.8658): 1%| | 2/250 [00:10<20:20, 4.92s/it]
Training 1/1 epoch (loss 2.8360): 1%| | 2/250 [00:11<20:20, 4.92s/it]
Training 1/1 epoch (loss 2.8360): 1%| | 3/250 [00:11<12:48, 3.11s/it]
Training 1/1 epoch (loss 2.6588): 1%| | 3/250 [00:14<12:48, 3.11s/it]
Training 1/1 epoch (loss 2.6588): 2%|β | 4/250 [00:14<11:34, 2.82s/it]
Training 1/1 epoch (loss 2.7341): 2%|β | 4/250 [00:16<11:34, 2.82s/it]
Training 1/1 epoch (loss 2.7341): 2%|β | 5/250 [00:16<10:26, 2.56s/it]
Training 1/1 epoch (loss 3.1292): 2%|β | 5/250 [00:16<10:26, 2.56s/it]
Training 1/1 epoch (loss 3.1292): 2%|β | 6/250 [00:16<07:39, 1.88s/it]
Training 1/1 epoch (loss 2.8984): 2%|β | 6/250 [00:18<07:39, 1.88s/it]
Training 1/1 epoch (loss 2.8984): 3%|β | 7/250 [00:18<07:44, 1.91s/it]
Training 1/1 epoch (loss 2.7593): 3%|β | 7/250 [00:20<07:44, 1.91s/it]
Training 1/1 epoch (loss 2.7593): 3%|β | 8/250 [00:20<07:23, 1.83s/it]
Training 1/1 epoch (loss 2.8483): 3%|β | 8/250 [00:21<07:23, 1.83s/it]
Training 1/1 epoch (loss 2.8483): 4%|β | 9/250 [00:21<06:19, 1.57s/it]
Training 1/1 epoch (loss 2.7407): 4%|β | 9/250 [00:22<06:19, 1.57s/it]
Training 1/1 epoch (loss 2.7407): 4%|β | 10/250 [00:22<05:32, 1.39s/it]
Training 1/1 epoch (loss 2.4931): 4%|β | 10/250 [00:24<05:32, 1.39s/it]
Training 1/1 epoch (loss 2.4931): 4%|β | 11/250 [00:24<05:59, 1.50s/it]
Training 1/1 epoch (loss 2.7044): 4%|β | 11/250 [00:25<05:59, 1.50s/it]
Training 1/1 epoch (loss 2.7044): 5%|β | 12/250 [00:25<05:19, 1.34s/it]
Training 1/1 epoch (loss 3.0594): 5%|β | 12/250 [00:27<05:19, 1.34s/it]
Training 1/1 epoch (loss 3.0594): 5%|β | 13/250 [00:27<06:14, 1.58s/it]
Training 1/1 epoch (loss 2.7175): 5%|β | 13/250 [00:28<06:14, 1.58s/it]
Training 1/1 epoch (loss 2.7175): 6%|β | 14/250 [00:28<05:39, 1.44s/it]
Training 1/1 epoch (loss 2.5632): 6%|β | 14/250 [00:29<05:39, 1.44s/it]
Training 1/1 epoch (loss 2.5632): 6%|β | 15/250 [00:29<04:58, 1.27s/it]
Training 1/1 epoch (loss 2.6712): 6%|β | 15/250 [00:31<04:58, 1.27s/it]
Training 1/1 epoch (loss 2.6712): 6%|β | 16/250 [00:31<05:46, 1.48s/it]
Training 1/1 epoch (loss 2.9078): 6%|β | 16/250 [00:32<05:46, 1.48s/it]
Training 1/1 epoch (loss 2.9078): 7%|β | 17/250 [00:32<05:31, 1.42s/it]
Training 1/1 epoch (loss 2.6559): 7%|β | 17/250 [00:34<05:31, 1.42s/it]
Training 1/1 epoch (loss 2.6559): 7%|β | 18/250 [00:34<06:11, 1.60s/it]
Training 1/1 epoch (loss 2.7091): 7%|β | 18/250 [00:36<06:11, 1.60s/it]
Training 1/1 epoch (loss 2.7091): 8%|β | 19/250 [00:36<06:05, 1.58s/it]
Training 1/1 epoch (loss 2.8137): 8%|β | 19/250 [00:36<06:05, 1.58s/it]
Training 1/1 epoch (loss 2.8137): 8%|β | 20/250 [00:36<05:05, 1.33s/it]
Training 1/1 epoch (loss 2.8219): 8%|β | 20/250 [00:39<05:05, 1.33s/it]
Training 1/1 epoch (loss 2.8219): 8%|β | 21/250 [00:39<06:22, 1.67s/it]
Training 1/1 epoch (loss 2.9877): 8%|β | 21/250 [00:41<06:22, 1.67s/it]
Training 1/1 epoch (loss 2.9877): 9%|β | 22/250 [00:41<06:42, 1.77s/it]
Training 1/1 epoch (loss 2.8472): 9%|β | 22/250 [00:41<06:42, 1.77s/it]
Training 1/1 epoch (loss 2.8472): 9%|β | 23/250 [00:41<05:12, 1.38s/it]
Training 1/1 epoch (loss 2.9751): 9%|β | 23/250 [00:43<05:12, 1.38s/it]
Training 1/1 epoch (loss 2.9751): 10%|β | 24/250 [00:43<05:25, 1.44s/it]
Training 1/1 epoch (loss 2.7497): 10%|β | 24/250 [00:45<05:25, 1.44s/it]
Training 1/1 epoch (loss 2.7497): 10%|β | 25/250 [00:45<05:46, 1.54s/it]
Training 1/1 epoch (loss 2.8018): 10%|β | 25/250 [00:45<05:46, 1.54s/it]
Training 1/1 epoch (loss 2.8018): 10%|β | 26/250 [00:45<04:31, 1.21s/it]
Training 1/1 epoch (loss 2.5768): 10%|β | 26/250 [00:47<04:31, 1.21s/it]
Training 1/1 epoch (loss 2.5768): 11%|β | 27/250 [00:47<05:27, 1.47s/it]
Training 1/1 epoch (loss 2.7097): 11%|β | 27/250 [00:49<05:27, 1.47s/it]
Training 1/1 epoch (loss 2.7097): 11%|β | 28/250 [00:49<05:33, 1.50s/it]
Training 1/1 epoch (loss 2.7825): 11%|β | 28/250 [00:50<05:33, 1.50s/it]
Training 1/1 epoch (loss 2.7825): 12%|ββ | 29/250 [00:50<04:56, 1.34s/it]
Training 1/1 epoch (loss 2.9220): 12%|ββ | 29/250 [00:51<04:56, 1.34s/it]
Training 1/1 epoch (loss 2.9220): 12%|ββ | 30/250 [00:51<04:40, 1.27s/it]
Training 1/1 epoch (loss 2.7745): 12%|ββ | 30/250 [00:52<04:40, 1.27s/it]
Training 1/1 epoch (loss 2.7745): 12%|ββ | 31/250 [00:52<04:32, 1.25s/it]
Training 1/1 epoch (loss 2.7967): 12%|ββ | 31/250 [00:53<04:32, 1.25s/it]
Training 1/1 epoch (loss 2.7967): 13%|ββ | 32/250 [00:53<04:38, 1.28s/it]
Training 1/1 epoch (loss 2.9162): 13%|ββ | 32/250 [00:56<04:38, 1.28s/it]
Training 1/1 epoch (loss 2.9162): 13%|ββ | 33/250 [00:56<05:57, 1.65s/it]
Training 1/1 epoch (loss 2.5924): 13%|ββ | 33/250 [00:57<05:57, 1.65s/it]
Training 1/1 epoch (loss 2.5924): 14%|ββ | 34/250 [00:57<05:25, 1.51s/it]
Training 1/1 epoch (loss 2.5276): 14%|ββ | 34/250 [00:58<05:25, 1.51s/it]
Training 1/1 epoch (loss 2.5276): 14%|ββ | 35/250 [00:58<05:22, 1.50s/it]
Training 1/1 epoch (loss 2.9826): 14%|ββ | 35/250 [01:00<05:22, 1.50s/it]
Training 1/1 epoch (loss 2.9826): 14%|ββ | 36/250 [01:00<05:13, 1.46s/it]
Training 1/1 epoch (loss 2.7090): 14%|ββ | 36/250 [01:01<05:13, 1.46s/it]
Training 1/1 epoch (loss 2.7090): 15%|ββ | 37/250 [01:01<04:34, 1.29s/it]
Training 1/1 epoch (loss 2.7532): 15%|ββ | 37/250 [01:03<04:34, 1.29s/it]
Training 1/1 epoch (loss 2.7532): 15%|ββ | 38/250 [01:03<05:50, 1.65s/it]
Training 1/1 epoch (loss 2.8025): 15%|ββ | 38/250 [01:04<05:50, 1.65s/it]
Training 1/1 epoch (loss 2.8025): 16%|ββ | 39/250 [01:04<05:26, 1.55s/it]
Training 1/1 epoch (loss 2.4618): 16%|ββ | 39/250 [01:06<05:26, 1.55s/it]
Training 1/1 epoch (loss 2.4618): 16%|ββ | 40/250 [01:06<05:06, 1.46s/it]
Training 1/1 epoch (loss 2.7450): 16%|ββ | 40/250 [01:08<05:06, 1.46s/it]
Training 1/1 epoch (loss 2.7450): 16%|ββ | 41/250 [01:08<05:28, 1.57s/it]
Training 1/1 epoch (loss 2.7078): 16%|ββ | 41/250 [01:09<05:28, 1.57s/it]
Training 1/1 epoch (loss 2.7078): 17%|ββ | 42/250 [01:09<05:18, 1.53s/it]
Training 1/1 epoch (loss 2.6009): 17%|ββ | 42/250 [01:11<05:18, 1.53s/it]
Training 1/1 epoch (loss 2.6009): 17%|ββ | 43/250 [01:11<05:28, 1.59s/it]
Training 1/1 epoch (loss 2.6622): 17%|ββ | 43/250 [01:13<05:28, 1.59s/it]
Training 1/1 epoch (loss 2.6622): 18%|ββ | 44/250 [01:13<05:41, 1.66s/it]
Training 1/1 epoch (loss 2.8190): 18%|ββ | 44/250 [01:13<05:41, 1.66s/it]
Training 1/1 epoch (loss 2.8190): 18%|ββ | 45/250 [01:13<04:46, 1.40s/it]
Training 1/1 epoch (loss 2.8833): 18%|ββ | 45/250 [01:15<04:46, 1.40s/it]
Training 1/1 epoch (loss 2.8833): 18%|ββ | 46/250 [01:15<04:50, 1.42s/it]
Training 1/1 epoch (loss 2.8682): 18%|ββ | 46/250 [01:16<04:50, 1.42s/it]
Training 1/1 epoch (loss 2.8682): 19%|ββ | 47/250 [01:16<04:39, 1.38s/it]
Training 1/1 epoch (loss 2.8500): 19%|ββ | 47/250 [01:17<04:39, 1.38s/it]
Training 1/1 epoch (loss 2.8500): 19%|ββ | 48/250 [01:17<03:57, 1.17s/it]
Training 1/1 epoch (loss 2.7100): 19%|ββ | 48/250 [01:18<03:57, 1.17s/it]
Training 1/1 epoch (loss 2.7100): 20%|ββ | 49/250 [01:18<04:08, 1.23s/it]
Training 1/1 epoch (loss 2.7277): 20%|ββ | 49/250 [01:20<04:08, 1.23s/it]
Training 1/1 epoch (loss 2.7277): 20%|ββ | 50/250 [01:20<04:50, 1.45s/it]
Training 1/1 epoch (loss 2.8431): 20%|ββ | 50/250 [01:21<04:50, 1.45s/it]
Training 1/1 epoch (loss 2.8431): 20%|ββ | 51/250 [01:21<04:08, 1.25s/it]
Training 1/1 epoch (loss 2.8169): 20%|ββ | 51/250 [01:23<04:08, 1.25s/it]
Training 1/1 epoch (loss 2.8169): 21%|ββ | 52/250 [01:23<04:40, 1.42s/it]
Training 1/1 epoch (loss 2.6882): 21%|ββ | 52/250 [01:24<04:40, 1.42s/it]
Training 1/1 epoch (loss 2.6882): 21%|ββ | 53/250 [01:24<04:41, 1.43s/it]
Training 1/1 epoch (loss 2.8664): 21%|ββ | 53/250 [01:25<04:41, 1.43s/it]
Training 1/1 epoch (loss 2.8664): 22%|βββ | 54/250 [01:25<04:17, 1.31s/it]
Training 1/1 epoch (loss 2.5920): 22%|βββ | 54/250 [01:26<04:17, 1.31s/it]
Training 1/1 epoch (loss 2.5920): 22%|βββ | 55/250 [01:26<04:02, 1.25s/it]
Training 1/1 epoch (loss 2.5944): 22%|βββ | 55/250 [01:28<04:02, 1.25s/it]
Training 1/1 epoch (loss 2.5944): 22%|βββ | 56/250 [01:28<04:25, 1.37s/it]
Training 1/1 epoch (loss 2.7095): 22%|βββ | 56/250 [01:29<04:25, 1.37s/it]
Training 1/1 epoch (loss 2.7095): 23%|βββ | 57/250 [01:29<04:04, 1.27s/it]
Training 1/1 epoch (loss 2.9662): 23%|βββ | 57/250 [01:30<04:04, 1.27s/it]
Training 1/1 epoch (loss 2.9662): 23%|βββ | 58/250 [01:30<04:12, 1.32s/it]
Training 1/1 epoch (loss 3.0020): 23%|βββ | 58/250 [01:31<04:12, 1.32s/it]
Training 1/1 epoch (loss 3.0020): 24%|βββ | 59/250 [01:31<03:44, 1.17s/it]
Training 1/1 epoch (loss 2.7076): 24%|βββ | 59/250 [01:32<03:44, 1.17s/it]
Training 1/1 epoch (loss 2.7076): 24%|βββ | 60/250 [01:32<03:21, 1.06s/it]
Training 1/1 epoch (loss 2.6651): 24%|βββ | 60/250 [01:33<03:21, 1.06s/it]
Training 1/1 epoch (loss 2.6651): 24%|βββ | 61/250 [01:33<03:35, 1.14s/it]
Training 1/1 epoch (loss 2.7907): 24%|βββ | 61/250 [01:34<03:35, 1.14s/it]
Training 1/1 epoch (loss 2.7907): 25%|βββ | 62/250 [01:34<03:21, 1.07s/it]
Training 1/1 epoch (loss 2.8172): 25%|βββ | 62/250 [01:36<03:21, 1.07s/it]
Training 1/1 epoch (loss 2.8172): 25%|βββ | 63/250 [01:36<03:30, 1.13s/it]
Training 1/1 epoch (loss 2.6557): 25%|βββ | 63/250 [01:38<03:30, 1.13s/it]
Training 1/1 epoch (loss 2.6557): 26%|βββ | 64/250 [01:38<04:29, 1.45s/it]
Training 1/1 epoch (loss 2.7598): 26%|βββ | 64/250 [01:39<04:29, 1.45s/it]
Training 1/1 epoch (loss 2.7598): 26%|βββ | 65/250 [01:39<04:03, 1.32s/it]
Training 1/1 epoch (loss 2.7671): 26%|βββ | 65/250 [01:40<04:03, 1.32s/it]
Training 1/1 epoch (loss 2.7671): 26%|βββ | 66/250 [01:40<04:08, 1.35s/it]
Training 1/1 epoch (loss 2.7383): 26%|βββ | 66/250 [01:42<04:08, 1.35s/it]
Training 1/1 epoch (loss 2.7383): 27%|βββ | 67/250 [01:42<04:31, 1.48s/it]
Training 1/1 epoch (loss 2.5973): 27%|βββ | 67/250 [01:43<04:31, 1.48s/it]
Training 1/1 epoch (loss 2.5973): 27%|βββ | 68/250 [01:43<03:39, 1.21s/it]
Training 1/1 epoch (loss 2.6034): 27%|βββ | 68/250 [01:44<03:39, 1.21s/it]
Training 1/1 epoch (loss 2.6034): 28%|βββ | 69/250 [01:44<03:44, 1.24s/it]
Training 1/1 epoch (loss 2.8854): 28%|βββ | 69/250 [01:46<03:44, 1.24s/it]
Training 1/1 epoch (loss 2.8854): 28%|βββ | 70/250 [01:46<04:33, 1.52s/it]
Training 1/1 epoch (loss 2.6863): 28%|βββ | 70/250 [01:46<04:33, 1.52s/it]
Training 1/1 epoch (loss 2.6863): 28%|βββ | 71/250 [01:46<03:34, 1.20s/it]
Training 1/1 epoch (loss 2.8395): 28%|βββ | 71/250 [01:49<03:34, 1.20s/it]
Training 1/1 epoch (loss 2.8395): 29%|βββ | 72/250 [01:49<04:21, 1.47s/it]
Training 1/1 epoch (loss 2.5067): 29%|βββ | 72/250 [01:50<04:21, 1.47s/it]
Training 1/1 epoch (loss 2.5067): 29%|βββ | 73/250 [01:50<03:59, 1.36s/it]
Training 1/1 epoch (loss 2.6173): 29%|βββ | 73/250 [01:50<03:59, 1.36s/it]
Training 1/1 epoch (loss 2.6173): 30%|βββ | 74/250 [01:50<03:29, 1.19s/it]
Training 1/1 epoch (loss 2.8487): 30%|βββ | 74/250 [01:52<03:29, 1.19s/it]
Training 1/1 epoch (loss 2.8487): 30%|βββ | 75/250 [01:52<03:46, 1.29s/it]
Training 1/1 epoch (loss 2.5918): 30%|βββ | 75/250 [01:53<03:46, 1.29s/it]
Training 1/1 epoch (loss 2.5918): 30%|βββ | 76/250 [01:53<03:49, 1.32s/it]
Training 1/1 epoch (loss 2.5394): 30%|βββ | 76/250 [01:54<03:49, 1.32s/it]
Training 1/1 epoch (loss 2.5394): 31%|βββ | 77/250 [01:54<03:01, 1.05s/it]
Training 1/1 epoch (loss 2.6510): 31%|βββ | 77/250 [01:56<03:01, 1.05s/it]
Training 1/1 epoch (loss 2.6510): 31%|βββ | 78/250 [01:56<03:34, 1.25s/it]
Training 1/1 epoch (loss 2.7729): 31%|βββ | 78/250 [01:58<03:34, 1.25s/it]
Training 1/1 epoch (loss 2.7729): 32%|ββββ | 79/250 [01:58<04:19, 1.52s/it]
Training 1/1 epoch (loss 2.7221): 32%|ββββ | 79/250 [01:59<04:19, 1.52s/it]
Training 1/1 epoch (loss 2.7221): 32%|ββββ | 80/250 [01:59<03:50, 1.36s/it]
Training 1/1 epoch (loss 2.8683): 32%|ββββ | 80/250 [02:00<03:50, 1.36s/it]
Training 1/1 epoch (loss 2.8683): 32%|ββββ | 81/250 [02:00<03:33, 1.26s/it]
Training 1/1 epoch (loss 2.8933): 32%|ββββ | 81/250 [02:02<03:33, 1.26s/it]
Training 1/1 epoch (loss 2.8933): 33%|ββββ | 82/250 [02:02<04:15, 1.52s/it]
Training 1/1 epoch (loss 2.7869): 33%|ββββ | 82/250 [02:02<04:15, 1.52s/it]
Training 1/1 epoch (loss 2.7869): 33%|ββββ | 83/250 [02:02<03:27, 1.24s/it]
Training 1/1 epoch (loss 2.6405): 33%|ββββ | 83/250 [02:04<03:27, 1.24s/it]
Training 1/1 epoch (loss 2.6405): 34%|ββββ | 84/250 [02:04<03:53, 1.41s/it]
Training 1/1 epoch (loss 2.8538): 34%|ββββ | 84/250 [02:06<03:53, 1.41s/it]
Training 1/1 epoch (loss 2.8538): 34%|ββββ | 85/250 [02:06<04:19, 1.58s/it]
Training 1/1 epoch (loss 2.7026): 34%|ββββ | 85/250 [02:07<04:19, 1.58s/it]
Training 1/1 epoch (loss 2.7026): 34%|ββββ | 86/250 [02:07<03:22, 1.24s/it]
Training 1/1 epoch (loss 2.7400): 34%|ββββ | 86/250 [02:09<03:22, 1.24s/it]
Training 1/1 epoch (loss 2.7400): 35%|ββββ | 87/250 [02:09<04:21, 1.60s/it]
Training 1/1 epoch (loss 2.7234): 35%|ββββ | 87/250 [02:11<04:21, 1.60s/it]
Training 1/1 epoch (loss 2.7234): 35%|ββββ | 88/250 [02:11<04:14, 1.57s/it]
Training 1/1 epoch (loss 2.7721): 35%|ββββ | 88/250 [02:12<04:14, 1.57s/it]
Training 1/1 epoch (loss 2.7721): 36%|ββββ | 89/250 [02:12<03:52, 1.45s/it]
Training 1/1 epoch (loss 2.8597): 36%|ββββ | 89/250 [02:13<03:52, 1.45s/it]
Training 1/1 epoch (loss 2.8597): 36%|ββββ | 90/250 [02:13<03:45, 1.41s/it]
Training 1/1 epoch (loss 2.7913): 36%|ββββ | 90/250 [02:14<03:45, 1.41s/it]
Training 1/1 epoch (loss 2.7913): 36%|ββββ | 91/250 [02:14<03:28, 1.31s/it]
Training 1/1 epoch (loss 2.6044): 36%|ββββ | 91/250 [02:15<03:28, 1.31s/it]
Training 1/1 epoch (loss 2.6044): 37%|ββββ | 92/250 [02:15<03:23, 1.29s/it]
Training 1/1 epoch (loss 2.4312): 37%|ββββ | 92/250 [02:17<03:23, 1.29s/it]
Training 1/1 epoch (loss 2.4312): 37%|ββββ | 93/250 [02:17<03:39, 1.40s/it]
Training 1/1 epoch (loss 2.7861): 37%|ββββ | 93/250 [02:18<03:39, 1.40s/it]
Training 1/1 epoch (loss 2.7861): 38%|ββββ | 94/250 [02:18<03:32, 1.36s/it]
Training 1/1 epoch (loss 2.8403): 38%|ββββ | 94/250 [02:19<03:32, 1.36s/it]
Training 1/1 epoch (loss 2.8403): 38%|ββββ | 95/250 [02:19<03:08, 1.21s/it]
Training 1/1 epoch (loss 2.6752): 38%|ββββ | 95/250 [02:21<03:08, 1.21s/it]
Training 1/1 epoch (loss 2.6752): 38%|ββββ | 96/250 [02:21<03:45, 1.46s/it]
Training 1/1 epoch (loss 2.7539): 38%|ββββ | 96/250 [02:22<03:45, 1.46s/it]
Training 1/1 epoch (loss 2.7539): 39%|ββββ | 97/250 [02:22<03:17, 1.29s/it]
Training 1/1 epoch (loss 2.7448): 39%|ββββ | 97/250 [02:23<03:17, 1.29s/it]
Training 1/1 epoch (loss 2.7448): 39%|ββββ | 98/250 [02:23<03:15, 1.29s/it]
Training 1/1 epoch (loss 2.7865): 39%|ββββ | 98/250 [02:25<03:15, 1.29s/it]
Training 1/1 epoch (loss 2.7865): 40%|ββββ | 99/250 [02:25<03:26, 1.37s/it]
Training 1/1 epoch (loss 2.7721): 40%|ββββ | 99/250 [02:26<03:26, 1.37s/it]
Training 1/1 epoch (loss 2.7721): 40%|ββββ | 100/250 [02:26<02:53, 1.16s/it]
Training 1/1 epoch (loss 2.7016): 40%|ββββ | 100/250 [02:27<02:53, 1.16s/it]
Training 1/1 epoch (loss 2.7016): 40%|ββββ | 101/250 [02:27<03:12, 1.29s/it]
Training 1/1 epoch (loss 2.7717): 40%|ββββ | 101/250 [02:29<03:12, 1.29s/it]
Training 1/1 epoch (loss 2.7717): 41%|ββββ | 102/250 [02:29<03:40, 1.49s/it]
Training 1/1 epoch (loss 2.8356): 41%|ββββ | 102/250 [02:30<03:40, 1.49s/it]
Training 1/1 epoch (loss 2.8356): 41%|ββββ | 103/250 [02:30<03:10, 1.29s/it]
Training 1/1 epoch (loss 2.6755): 41%|ββββ | 103/250 [02:32<03:10, 1.29s/it]
Training 1/1 epoch (loss 2.6755): 42%|βββββ | 104/250 [02:32<04:00, 1.65s/it]
Training 1/1 epoch (loss 2.7644): 42%|βββββ | 104/250 [02:34<04:00, 1.65s/it]
Training 1/1 epoch (loss 2.7644): 42%|βββββ | 105/250 [02:34<03:50, 1.59s/it]
Training 1/1 epoch (loss 2.4983): 42%|βββββ | 105/250 [02:35<03:50, 1.59s/it]
Training 1/1 epoch (loss 2.4983): 42%|βββββ | 106/250 [02:35<03:12, 1.34s/it]
Training 1/1 epoch (loss 2.8663): 42%|βββββ | 106/250 [02:37<03:12, 1.34s/it]
Training 1/1 epoch (loss 2.8663): 43%|βββββ | 107/250 [02:37<03:33, 1.49s/it]
Training 1/1 epoch (loss 2.9266): 43%|βββββ | 107/250 [02:38<03:33, 1.49s/it]
Training 1/1 epoch (loss 2.9266): 43%|βββββ | 108/250 [02:38<03:25, 1.45s/it]
Training 1/1 epoch (loss 2.6324): 43%|βββββ | 108/250 [02:39<03:25, 1.45s/it]
Training 1/1 epoch (loss 2.6324): 44%|βββββ | 109/250 [02:39<02:57, 1.26s/it]
Training 1/1 epoch (loss 2.7190): 44%|βββββ | 109/250 [02:40<02:57, 1.26s/it]
Training 1/1 epoch (loss 2.7190): 44%|βββββ | 110/250 [02:40<03:18, 1.42s/it]
Training 1/1 epoch (loss 2.6988): 44%|βββββ | 110/250 [02:42<03:18, 1.42s/it]
Training 1/1 epoch (loss 2.6988): 44%|βββββ | 111/250 [02:42<03:23, 1.46s/it]
Training 1/1 epoch (loss 2.8055): 44%|βββββ | 111/250 [02:43<03:23, 1.46s/it]
Training 1/1 epoch (loss 2.8055): 45%|βββββ | 112/250 [02:43<02:54, 1.26s/it]
Training 1/1 epoch (loss 2.8091): 45%|βββββ | 112/250 [02:45<02:54, 1.26s/it]
Training 1/1 epoch (loss 2.8091): 45%|βββββ | 113/250 [02:45<03:44, 1.64s/it]
Training 1/1 epoch (loss 2.8238): 45%|βββββ | 113/250 [02:46<03:44, 1.64s/it]
Training 1/1 epoch (loss 2.8238): 46%|βββββ | 114/250 [02:46<03:15, 1.43s/it]
Training 1/1 epoch (loss 2.6247): 46%|βββββ | 114/250 [02:47<03:15, 1.43s/it]
Training 1/1 epoch (loss 2.6247): 46%|βββββ | 115/250 [02:47<02:31, 1.12s/it]
Training 1/1 epoch (loss 2.7645): 46%|βββββ | 115/250 [02:48<02:31, 1.12s/it]
Training 1/1 epoch (loss 2.7645): 46%|βββββ | 116/250 [02:48<02:54, 1.30s/it]
Training 1/1 epoch (loss 2.8258): 46%|βββββ | 116/250 [02:51<02:54, 1.30s/it]
Training 1/1 epoch (loss 2.8258): 47%|βββββ | 117/250 [02:51<03:40, 1.66s/it]
Training 1/1 epoch (loss 2.6829): 47%|βββββ | 117/250 [02:51<03:40, 1.66s/it]
Training 1/1 epoch (loss 2.6829): 47%|βββββ | 118/250 [02:51<02:53, 1.32s/it]
Training 1/1 epoch (loss 2.7550): 47%|βββββ | 118/250 [02:53<02:53, 1.32s/it]
Training 1/1 epoch (loss 2.7550): 48%|βββββ | 119/250 [02:53<03:10, 1.45s/it]
Training 1/1 epoch (loss 2.7538): 48%|βββββ | 119/250 [02:55<03:10, 1.45s/it]
Training 1/1 epoch (loss 2.7538): 48%|βββββ | 120/250 [02:55<03:30, 1.62s/it]
Training 1/1 epoch (loss 2.7670): 48%|βββββ | 120/250 [02:56<03:30, 1.62s/it]
Training 1/1 epoch (loss 2.7670): 48%|βββββ | 121/250 [02:56<03:05, 1.44s/it]
Training 1/1 epoch (loss 2.6386): 48%|βββββ | 121/250 [02:58<03:05, 1.44s/it]
Training 1/1 epoch (loss 2.6386): 49%|βββββ | 122/250 [02:58<03:19, 1.56s/it]
Training 1/1 epoch (loss 2.7396): 49%|βββββ | 122/250 [02:59<03:19, 1.56s/it]
Training 1/1 epoch (loss 2.7396): 49%|βββββ | 123/250 [02:59<02:57, 1.40s/it]
Training 1/1 epoch (loss 2.9200): 49%|βββββ | 123/250 [03:00<02:57, 1.40s/it]
Training 1/1 epoch (loss 2.9200): 50%|βββββ | 124/250 [03:00<02:48, 1.33s/it]
Training 1/1 epoch (loss 2.7057): 50%|βββββ | 124/250 [03:02<02:48, 1.33s/it]
Training 1/1 epoch (loss 2.7057): 50%|βββββ | 125/250 [03:02<02:47, 1.34s/it]
Training 1/1 epoch (loss 2.9420): 50%|βββββ | 125/250 [03:03<02:47, 1.34s/it]
Training 1/1 epoch (loss 2.9420): 50%|βββββ | 126/250 [03:03<02:54, 1.41s/it]
Training 1/1 epoch (loss 2.8064): 50%|βββββ | 126/250 [03:04<02:54, 1.41s/it]
Training 1/1 epoch (loss 2.8064): 51%|βββββ | 127/250 [03:04<02:38, 1.29s/it]
Training 1/1 epoch (loss 2.9248): 51%|βββββ | 127/250 [03:07<02:38, 1.29s/it]
Training 1/1 epoch (loss 2.9248): 51%|βββββ | 128/250 [03:07<03:18, 1.62s/it]
Training 1/1 epoch (loss 2.6603): 51%|βββββ | 128/250 [03:08<03:18, 1.62s/it]
Training 1/1 epoch (loss 2.6603): 52%|ββββββ | 129/250 [03:08<02:52, 1.42s/it]
Training 1/1 epoch (loss 2.6543): 52%|ββββββ | 129/250 [03:09<02:52, 1.42s/it]
Training 1/1 epoch (loss 2.6543): 52%|ββββββ | 130/250 [03:09<02:57, 1.48s/it]
Training 1/1 epoch (loss 2.7382): 52%|ββββββ | 130/250 [03:11<02:57, 1.48s/it]
Training 1/1 epoch (loss 2.7382): 52%|ββββββ | 131/250 [03:11<03:13, 1.62s/it]
Training 1/1 epoch (loss 2.6377): 52%|ββββββ | 131/250 [03:12<03:13, 1.62s/it]
Training 1/1 epoch (loss 2.6377): 53%|ββββββ | 132/250 [03:12<02:39, 1.35s/it]
Training 1/1 epoch (loss 2.7049): 53%|ββββββ | 132/250 [03:14<02:39, 1.35s/it]
Training 1/1 epoch (loss 2.7049): 53%|ββββββ | 133/250 [03:14<03:05, 1.59s/it]
Training 1/1 epoch (loss 2.8196): 53%|ββββββ | 133/250 [03:15<03:05, 1.59s/it]
Training 1/1 epoch (loss 2.8196): 54%|ββββββ | 134/250 [03:15<02:45, 1.43s/it]
Training 1/1 epoch (loss 2.8441): 54%|ββββββ | 134/250 [03:16<02:45, 1.43s/it]
Training 1/1 epoch (loss 2.8441): 54%|ββββββ | 135/250 [03:16<02:36, 1.36s/it]
Training 1/1 epoch (loss 3.0281): 54%|ββββββ | 135/250 [03:19<02:36, 1.36s/it]
Training 1/1 epoch (loss 3.0281): 54%|ββββββ | 136/250 [03:19<03:12, 1.69s/it]
Training 1/1 epoch (loss 2.7770): 54%|ββββββ | 136/250 [03:20<03:12, 1.69s/it]
Training 1/1 epoch (loss 2.7770): 55%|ββββββ | 137/250 [03:20<02:57, 1.58s/it]
Training 1/1 epoch (loss 2.7553): 55%|ββββββ | 137/250 [03:21<02:57, 1.58s/it]
Training 1/1 epoch (loss 2.7553): 55%|ββββββ | 138/250 [03:21<02:46, 1.48s/it]
Training 1/1 epoch (loss 2.6128): 55%|ββββββ | 138/250 [03:23<02:46, 1.48s/it]
Training 1/1 epoch (loss 2.6128): 56%|ββββββ | 139/250 [03:23<02:45, 1.49s/it]
Training 1/1 epoch (loss 3.0134): 56%|ββββββ | 139/250 [03:24<02:45, 1.49s/it]
Training 1/1 epoch (loss 3.0134): 56%|ββββββ | 140/250 [03:24<02:20, 1.28s/it]
Training 1/1 epoch (loss 2.5320): 56%|ββββββ | 140/250 [03:26<02:20, 1.28s/it]
Training 1/1 epoch (loss 2.5320): 56%|ββββββ | 141/250 [03:26<03:00, 1.65s/it]
Training 1/1 epoch (loss 2.8045): 56%|ββββββ | 141/250 [03:28<03:00, 1.65s/it]
Training 1/1 epoch (loss 2.8045): 57%|ββββββ | 142/250 [03:28<03:08, 1.75s/it]
Training 1/1 epoch (loss 2.7621): 57%|ββββββ | 142/250 [03:29<03:08, 1.75s/it]
Training 1/1 epoch (loss 2.7621): 57%|ββββββ | 143/250 [03:29<02:27, 1.38s/it]
Training 1/1 epoch (loss 2.7278): 57%|ββββββ | 143/250 [03:30<02:27, 1.38s/it]
Training 1/1 epoch (loss 2.7278): 58%|ββββββ | 144/250 [03:30<02:32, 1.44s/it]
Training 1/1 epoch (loss 2.5995): 58%|ββββββ | 144/250 [03:32<02:32, 1.44s/it]
Training 1/1 epoch (loss 2.5995): 58%|ββββββ | 145/250 [03:32<02:46, 1.58s/it]
Training 1/1 epoch (loss 2.7826): 58%|ββββββ | 145/250 [03:33<02:46, 1.58s/it]
Training 1/1 epoch (loss 2.7826): 58%|ββββββ | 146/250 [03:33<02:09, 1.25s/it]
Training 1/1 epoch (loss 2.8018): 58%|ββββββ | 146/250 [03:35<02:09, 1.25s/it]
Training 1/1 epoch (loss 2.8018): 59%|ββββββ | 147/250 [03:35<02:30, 1.46s/it]
Training 1/1 epoch (loss 2.8571): 59%|ββββββ | 147/250 [03:36<02:30, 1.46s/it]
Training 1/1 epoch (loss 2.8571): 59%|ββββββ | 148/250 [03:36<02:30, 1.48s/it]
Training 1/1 epoch (loss 2.8221): 59%|ββββββ | 148/250 [03:37<02:30, 1.48s/it]
Training 1/1 epoch (loss 2.8221): 60%|ββββββ | 149/250 [03:37<02:09, 1.28s/it]
Training 1/1 epoch (loss 2.8324): 60%|ββββββ | 149/250 [03:38<02:09, 1.28s/it]
Training 1/1 epoch (loss 2.8324): 60%|ββββββ | 150/250 [03:38<02:14, 1.34s/it]
Training 1/1 epoch (loss 2.5288): 60%|ββββββ | 150/250 [03:41<02:14, 1.34s/it]
Training 1/1 epoch (loss 2.5288): 60%|ββββββ | 151/250 [03:41<02:43, 1.65s/it]
Training 1/1 epoch (loss 2.8542): 60%|ββββββ | 151/250 [03:41<02:43, 1.65s/it]
Training 1/1 epoch (loss 2.8542): 61%|ββββββ | 152/250 [03:41<02:12, 1.36s/it]
Training 1/1 epoch (loss 2.6368): 61%|ββββββ | 152/250 [03:43<02:12, 1.36s/it]
Training 1/1 epoch (loss 2.6368): 61%|ββββββ | 153/250 [03:43<02:05, 1.30s/it]
Training 1/1 epoch (loss 2.7577): 61%|ββββββ | 153/250 [03:44<02:05, 1.30s/it]
Training 1/1 epoch (loss 2.7577): 62%|βββββββ | 154/250 [03:44<02:04, 1.30s/it]
Training 1/1 epoch (loss 2.8276): 62%|βββββββ | 154/250 [03:45<02:04, 1.30s/it]
Training 1/1 epoch (loss 2.8276): 62%|βββββββ | 155/250 [03:45<01:45, 1.11s/it]
Training 1/1 epoch (loss 2.8907): 62%|βββββββ | 155/250 [03:46<01:45, 1.11s/it]
Training 1/1 epoch (loss 2.8907): 62%|βββββββ | 156/250 [03:46<01:57, 1.25s/it]
Training 1/1 epoch (loss 2.7069): 62%|βββββββ | 156/250 [03:48<01:57, 1.25s/it]
Training 1/1 epoch (loss 2.7069): 63%|βββββββ | 157/250 [03:48<02:15, 1.46s/it]
Training 1/1 epoch (loss 2.6428): 63%|βββββββ | 157/250 [03:49<02:15, 1.46s/it]
Training 1/1 epoch (loss 2.6428): 63%|βββββββ | 158/250 [03:49<01:48, 1.18s/it]
Training 1/1 epoch (loss 2.6741): 63%|βββββββ | 158/250 [03:51<01:48, 1.18s/it]
Training 1/1 epoch (loss 2.6741): 64%|βββββββ | 159/250 [03:51<02:20, 1.55s/it]
Training 1/1 epoch (loss 2.5567): 64%|βββββββ | 159/250 [03:53<02:20, 1.55s/it]
Training 1/1 epoch (loss 2.5567): 64%|βββββββ | 160/250 [03:53<02:24, 1.61s/it]
Training 1/1 epoch (loss 2.7527): 64%|βββββββ | 160/250 [03:54<02:24, 1.61s/it]
Training 1/1 epoch (loss 2.7527): 64%|βββββββ | 161/250 [03:54<02:12, 1.48s/it]
Training 1/1 epoch (loss 2.6187): 64%|βββββββ | 161/250 [03:56<02:12, 1.48s/it]
Training 1/1 epoch (loss 2.6187): 65%|βββββββ | 162/250 [03:56<02:29, 1.70s/it]
Training 1/1 epoch (loss 2.6522): 65%|βββββββ | 162/250 [03:57<02:29, 1.70s/it]
Training 1/1 epoch (loss 2.6522): 65%|βββββββ | 163/250 [03:57<02:08, 1.48s/it]
Training 1/1 epoch (loss 2.6867): 65%|βββββββ | 163/250 [03:58<02:08, 1.48s/it]
Training 1/1 epoch (loss 2.6867): 66%|βββββββ | 164/250 [03:58<01:51, 1.30s/it]
Training 1/1 epoch (loss 2.5376): 66%|βββββββ | 164/250 [04:00<01:51, 1.30s/it]
Training 1/1 epoch (loss 2.5376): 66%|βββββββ | 165/250 [04:00<01:56, 1.37s/it]
Training 1/1 epoch (loss 2.4156): 66%|βββββββ | 165/250 [04:00<01:56, 1.37s/it]
Training 1/1 epoch (loss 2.4156): 66%|βββββββ | 166/250 [04:00<01:39, 1.18s/it]
Training 1/1 epoch (loss 2.7996): 66%|βββββββ | 166/250 [04:01<01:39, 1.18s/it]
Training 1/1 epoch (loss 2.7996): 67%|βββββββ | 167/250 [04:01<01:29, 1.08s/it]
Training 1/1 epoch (loss 2.9061): 67%|βββββββ | 167/250 [04:03<01:29, 1.08s/it]
Training 1/1 epoch (loss 2.9061): 67%|βββββββ | 168/250 [04:03<01:57, 1.44s/it]
Training 1/1 epoch (loss 2.7001): 67%|βββββββ | 168/250 [04:04<01:57, 1.44s/it]
Training 1/1 epoch (loss 2.7001): 68%|βββββββ | 169/250 [04:04<01:37, 1.21s/it]
Training 1/1 epoch (loss 2.8013): 68%|βββββββ | 169/250 [04:07<01:37, 1.21s/it]
Training 1/1 epoch (loss 2.8013): 68%|βββββββ | 170/250 [04:07<02:07, 1.59s/it]
Training 1/1 epoch (loss 2.7946): 68%|βββββββ | 170/250 [04:08<02:07, 1.59s/it]
Training 1/1 epoch (loss 2.7946): 68%|βββββββ | 171/250 [04:08<01:56, 1.48s/it]
Training 1/1 epoch (loss 2.5979): 68%|βββββββ | 171/250 [04:09<01:56, 1.48s/it]
Training 1/1 epoch (loss 2.5979): 69%|βββββββ | 172/250 [04:09<01:42, 1.32s/it]
Training 1/1 epoch (loss 2.6809): 69%|βββββββ | 172/250 [04:09<01:42, 1.32s/it]
Training 1/1 epoch (loss 2.6809): 69%|βββββββ | 173/250 [04:09<01:29, 1.16s/it]
Training 1/1 epoch (loss 2.8053): 69%|βββββββ | 173/250 [04:11<01:29, 1.16s/it]
Training 1/1 epoch (loss 2.8053): 70%|βββββββ | 174/250 [04:11<01:46, 1.40s/it]
Training 1/1 epoch (loss 2.8034): 70%|βββββββ | 174/250 [04:13<01:46, 1.40s/it]
Training 1/1 epoch (loss 2.8034): 70%|βββββββ | 175/250 [04:13<01:40, 1.34s/it]
Training 1/1 epoch (loss 2.7811): 70%|βββββββ | 175/250 [04:14<01:40, 1.34s/it]
Training 1/1 epoch (loss 2.7811): 70%|βββββββ | 176/250 [04:14<01:45, 1.43s/it]
Training 1/1 epoch (loss 2.8430): 70%|βββββββ | 176/250 [04:16<01:45, 1.43s/it]
Training 1/1 epoch (loss 2.8430): 71%|βββββββ | 177/250 [04:16<01:56, 1.59s/it]
Training 1/1 epoch (loss 2.9114): 71%|βββββββ | 177/250 [04:17<01:56, 1.59s/it]
Training 1/1 epoch (loss 2.9114): 71%|βββββββ | 178/250 [04:17<01:33, 1.30s/it]
Training 1/1 epoch (loss 2.8495): 71%|βββββββ | 178/250 [04:18<01:33, 1.30s/it]
Training 1/1 epoch (loss 2.8495): 72%|ββββββββ | 179/250 [04:18<01:34, 1.33s/it]
Training 1/1 epoch (loss 2.5811): 72%|ββββββββ | 179/250 [04:20<01:34, 1.33s/it]
Training 1/1 epoch (loss 2.5811): 72%|ββββββββ | 180/250 [04:20<01:34, 1.36s/it]
Training 1/1 epoch (loss 2.6840): 72%|ββββββββ | 180/250 [04:20<01:34, 1.36s/it]
Training 1/1 epoch (loss 2.6840): 72%|ββββββββ | 181/250 [04:20<01:16, 1.11s/it]
Training 1/1 epoch (loss 2.7319): 72%|ββββββββ | 181/250 [04:22<01:16, 1.11s/it]
Training 1/1 epoch (loss 2.7319): 73%|ββββββββ | 182/250 [04:22<01:38, 1.45s/it]
Training 1/1 epoch (loss 2.4945): 73%|ββββββββ | 182/250 [04:24<01:38, 1.45s/it]
Training 1/1 epoch (loss 2.4945): 73%|ββββββββ | 183/250 [04:24<01:37, 1.46s/it]
Training 1/1 epoch (loss 2.7082): 73%|ββββββββ | 183/250 [04:25<01:37, 1.46s/it]
Training 1/1 epoch (loss 2.7082): 74%|ββββββββ | 184/250 [04:25<01:22, 1.26s/it]
Training 1/1 epoch (loss 2.8083): 74%|ββββββββ | 184/250 [04:27<01:22, 1.26s/it]
Training 1/1 epoch (loss 2.8083): 74%|ββββββββ | 185/250 [04:27<01:35, 1.47s/it]
Training 1/1 epoch (loss 2.8050): 74%|ββββββββ | 185/250 [04:28<01:35, 1.47s/it]
Training 1/1 epoch (loss 2.8050): 74%|ββββββββ | 186/250 [04:28<01:32, 1.44s/it]
Training 1/1 epoch (loss 2.7737): 74%|ββββββββ | 186/250 [04:30<01:32, 1.44s/it]
Training 1/1 epoch (loss 2.7737): 75%|ββββββββ | 187/250 [04:30<01:42, 1.63s/it]
Training 1/1 epoch (loss 2.8894): 75%|ββββββββ | 187/250 [04:32<01:42, 1.63s/it]
Training 1/1 epoch (loss 2.8894): 75%|ββββββββ | 188/250 [04:32<01:42, 1.66s/it]
Training 1/1 epoch (loss 2.5773): 75%|ββββββββ | 188/250 [04:33<01:42, 1.66s/it]
Training 1/1 epoch (loss 2.5773): 76%|ββββββββ | 189/250 [04:33<01:25, 1.40s/it]
Training 1/1 epoch (loss 2.7231): 76%|ββββββββ | 189/250 [04:35<01:25, 1.40s/it]
Training 1/1 epoch (loss 2.7231): 76%|ββββββββ | 190/250 [04:35<01:43, 1.72s/it]
Training 1/1 epoch (loss 2.8657): 76%|ββββββββ | 190/250 [04:37<01:43, 1.72s/it]
Training 1/1 epoch (loss 2.8657): 76%|ββββββββ | 191/250 [04:37<01:38, 1.67s/it]
Training 1/1 epoch (loss 2.7695): 76%|ββββββββ | 191/250 [04:37<01:38, 1.67s/it]
Training 1/1 epoch (loss 2.7695): 77%|ββββββββ | 192/250 [04:37<01:20, 1.39s/it]
Training 1/1 epoch (loss 2.6054): 77%|ββββββββ | 192/250 [04:39<01:20, 1.39s/it]
Training 1/1 epoch (loss 2.6054): 77%|ββββββββ | 193/250 [04:39<01:28, 1.55s/it]
Training 1/1 epoch (loss 2.6963): 77%|ββββββββ | 193/250 [04:41<01:28, 1.55s/it]
Training 1/1 epoch (loss 2.6963): 78%|ββββββββ | 194/250 [04:41<01:31, 1.64s/it]
Training 1/1 epoch (loss 2.7515): 78%|ββββββββ | 194/250 [04:42<01:31, 1.64s/it]
Training 1/1 epoch (loss 2.7515): 78%|ββββββββ | 195/250 [04:42<01:21, 1.49s/it]
Training 1/1 epoch (loss 2.6130): 78%|ββββββββ | 195/250 [04:45<01:21, 1.49s/it]
Training 1/1 epoch (loss 2.6130): 78%|ββββββββ | 196/250 [04:45<01:35, 1.77s/it]
Training 1/1 epoch (loss 2.6528): 78%|ββββββββ | 196/250 [04:46<01:35, 1.77s/it]
Training 1/1 epoch (loss 2.6528): 79%|ββββββββ | 197/250 [04:46<01:27, 1.66s/it]
Training 1/1 epoch (loss 2.7072): 79%|ββββββββ | 197/250 [04:47<01:27, 1.66s/it]
Training 1/1 epoch (loss 2.7072): 79%|ββββββββ | 198/250 [04:47<01:12, 1.39s/it]
Training 1/1 epoch (loss 2.7420): 79%|ββββββββ | 198/250 [04:49<01:12, 1.39s/it]
Training 1/1 epoch (loss 2.7420): 80%|ββββββββ | 199/250 [04:49<01:23, 1.64s/it]
Training 1/1 epoch (loss 2.7201): 80%|ββββββββ | 199/250 [04:50<01:23, 1.64s/it]
Training 1/1 epoch (loss 2.7201): 80%|ββββββββ | 200/250 [04:50<01:13, 1.46s/it]
Training 1/1 epoch (loss 2.5861): 80%|ββββββββ | 200/250 [04:53<01:13, 1.46s/it]
Training 1/1 epoch (loss 2.5861): 80%|ββββββββ | 201/250 [04:53<01:25, 1.74s/it]
Training 1/1 epoch (loss 2.7598): 80%|ββββββββ | 201/250 [04:54<01:25, 1.74s/it]
Training 1/1 epoch (loss 2.7598): 81%|ββββββββ | 202/250 [04:54<01:21, 1.70s/it]
Training 1/1 epoch (loss 2.7075): 81%|ββββββββ | 202/250 [04:55<01:21, 1.70s/it]
Training 1/1 epoch (loss 2.7075): 81%|ββββββββ | 203/250 [04:55<01:03, 1.34s/it]
Training 1/1 epoch (loss 2.7569): 81%|ββββββββ | 203/250 [04:56<01:03, 1.34s/it]
Training 1/1 epoch (loss 2.7569): 82%|βββββββββ | 204/250 [04:56<01:02, 1.36s/it]
Training 1/1 epoch (loss 2.6823): 82%|βββββββββ | 204/250 [04:59<01:02, 1.36s/it]
Training 1/1 epoch (loss 2.6823): 82%|βββββββββ | 205/250 [04:59<01:16, 1.69s/it]
Training 1/1 epoch (loss 2.9427): 82%|βββββββββ | 205/250 [04:59<01:16, 1.69s/it]
Training 1/1 epoch (loss 2.9427): 82%|βββββββββ | 206/250 [04:59<00:57, 1.32s/it]
Training 1/1 epoch (loss 2.8157): 82%|βββββββββ | 206/250 [05:00<00:57, 1.32s/it]
Training 1/1 epoch (loss 2.8157): 83%|βββββββββ | 207/250 [05:00<00:58, 1.36s/it]
Training 1/1 epoch (loss 2.6426): 83%|βββββββββ | 207/250 [05:03<00:58, 1.36s/it]
Training 1/1 epoch (loss 2.6426): 83%|βββββββββ | 208/250 [05:03<01:13, 1.76s/it]
Training 1/1 epoch (loss 2.7528): 83%|βββββββββ | 208/250 [05:04<01:13, 1.76s/it]
Training 1/1 epoch (loss 2.7528): 84%|βββββββββ | 209/250 [05:04<00:59, 1.46s/it]
Training 1/1 epoch (loss 2.6254): 84%|βββββββββ | 209/250 [05:05<00:59, 1.46s/it]
Training 1/1 epoch (loss 2.6254): 84%|βββββββββ | 210/250 [05:05<00:58, 1.45s/it]
Training 1/1 epoch (loss 2.7336): 84%|βββββββββ | 210/250 [05:07<00:58, 1.45s/it]
Training 1/1 epoch (loss 2.7336): 84%|βββββββββ | 211/250 [05:07<00:57, 1.47s/it]
Training 1/1 epoch (loss 2.7969): 84%|βββββββββ | 211/250 [05:08<00:57, 1.47s/it]
Training 1/1 epoch (loss 2.7969): 85%|βββββββββ | 212/250 [05:08<00:51, 1.36s/it]
Training 1/1 epoch (loss 2.6675): 85%|βββββββββ | 212/250 [05:09<00:51, 1.36s/it]
Training 1/1 epoch (loss 2.6675): 85%|βββββββββ | 213/250 [05:09<00:46, 1.27s/it]
Training 1/1 epoch (loss 2.8788): 85%|βββββββββ | 213/250 [05:10<00:46, 1.27s/it]
Training 1/1 epoch (loss 2.8788): 86%|βββββββββ | 214/250 [05:10<00:41, 1.15s/it]
Training 1/1 epoch (loss 2.8369): 86%|βββββββββ | 214/250 [05:11<00:41, 1.15s/it]
Training 1/1 epoch (loss 2.8369): 86%|βββββββββ | 215/250 [05:11<00:39, 1.14s/it]
Training 1/1 epoch (loss 2.7131): 86%|βββββββββ | 215/250 [05:13<00:39, 1.14s/it]
Training 1/1 epoch (loss 2.7131): 86%|βββββββββ | 216/250 [05:13<00:50, 1.48s/it]
Training 1/1 epoch (loss 2.8857): 86%|βββββββββ | 216/250 [05:14<00:50, 1.48s/it]
Training 1/1 epoch (loss 2.8857): 87%|βββββββββ | 217/250 [05:14<00:43, 1.32s/it]
Training 1/1 epoch (loss 2.7630): 87%|βββββββββ | 217/250 [05:17<00:43, 1.32s/it]
Training 1/1 epoch (loss 2.7630): 87%|βββββββββ | 218/250 [05:17<00:53, 1.66s/it]
Training 1/1 epoch (loss 2.9010): 87%|βββββββββ | 218/250 [05:18<00:53, 1.66s/it]
Training 1/1 epoch (loss 2.9010): 88%|βββββββββ | 219/250 [05:18<00:50, 1.64s/it]
Training 1/1 epoch (loss 2.8852): 88%|βββββββββ | 219/250 [05:19<00:50, 1.64s/it]
Training 1/1 epoch (loss 2.8852): 88%|βββββββββ | 220/250 [05:19<00:40, 1.34s/it]
Training 1/1 epoch (loss 2.7409): 88%|βββββββββ | 220/250 [05:21<00:40, 1.34s/it]
Training 1/1 epoch (loss 2.7409): 88%|βββββββββ | 221/250 [05:21<00:48, 1.68s/it]
Training 1/1 epoch (loss 2.6669): 88%|βββββββββ | 221/250 [05:23<00:48, 1.68s/it]
Training 1/1 epoch (loss 2.6669): 89%|βββββββββ | 222/250 [05:23<00:50, 1.79s/it]
Training 1/1 epoch (loss 2.6248): 89%|βββββββββ | 222/250 [05:24<00:50, 1.79s/it]
Training 1/1 epoch (loss 2.6248): 89%|βββββββββ | 223/250 [05:24<00:41, 1.55s/it]
Training 1/1 epoch (loss 2.7160): 89%|βββββββββ | 223/250 [05:27<00:41, 1.55s/it]
Training 1/1 epoch (loss 2.7160): 90%|βββββββββ | 224/250 [05:27<00:48, 1.88s/it]
Training 1/1 epoch (loss 2.6894): 90%|βββββββββ | 224/250 [05:29<00:48, 1.88s/it]
Training 1/1 epoch (loss 2.6894): 90%|βββββββββ | 225/250 [05:29<00:44, 1.76s/it]
Training 1/1 epoch (loss 2.6485): 90%|βββββββββ | 225/250 [05:30<00:44, 1.76s/it]
Training 1/1 epoch (loss 2.6485): 90%|βββββββββ | 226/250 [05:30<00:41, 1.73s/it]
Training 1/1 epoch (loss 2.3350): 90%|βββββββββ | 226/250 [05:32<00:41, 1.73s/it]
Training 1/1 epoch (loss 2.3350): 91%|βββββββββ | 227/250 [05:32<00:39, 1.71s/it]
Training 1/1 epoch (loss 2.8237): 91%|βββββββββ | 227/250 [05:33<00:39, 1.71s/it]
Training 1/1 epoch (loss 2.8237): 91%|βββββββββ | 228/250 [05:33<00:30, 1.40s/it]
Training 1/1 epoch (loss 2.5592): 91%|βββββββββ | 228/250 [05:34<00:30, 1.40s/it]
Training 1/1 epoch (loss 2.5592): 92%|ββββββββββ| 229/250 [05:34<00:30, 1.45s/it]
Training 1/1 epoch (loss 2.9038): 92%|ββββββββββ| 229/250 [05:36<00:30, 1.45s/it]
Training 1/1 epoch (loss 2.9038): 92%|ββββββββββ| 230/250 [05:36<00:30, 1.52s/it]
Training 1/1 epoch (loss 2.8886): 92%|ββββββββββ| 230/250 [05:36<00:30, 1.52s/it]
Training 1/1 epoch (loss 2.8886): 92%|ββββββββββ| 231/250 [05:36<00:23, 1.25s/it]
Training 1/1 epoch (loss 2.6709): 92%|ββββββββββ| 231/250 [05:38<00:23, 1.25s/it]
Training 1/1 epoch (loss 2.6709): 93%|ββββββββββ| 232/250 [05:38<00:23, 1.28s/it]
Training 1/1 epoch (loss 2.6850): 93%|ββββββββββ| 232/250 [05:39<00:23, 1.28s/it]
Training 1/1 epoch (loss 2.6850): 93%|ββββββββββ| 233/250 [05:39<00:22, 1.34s/it]
Training 1/1 epoch (loss 2.7497): 93%|ββββββββββ| 233/250 [05:40<00:22, 1.34s/it]
Training 1/1 epoch (loss 2.7497): 94%|ββββββββββ| 234/250 [05:40<00:18, 1.14s/it]
Training 1/1 epoch (loss 2.5476): 94%|ββββββββββ| 234/250 [05:42<00:18, 1.14s/it]
Training 1/1 epoch (loss 2.5476): 94%|ββββββββββ| 235/250 [05:42<00:22, 1.53s/it]
Training 1/1 epoch (loss 2.6151): 94%|ββββββββββ| 235/250 [05:44<00:22, 1.53s/it]
Training 1/1 epoch (loss 2.6151): 94%|ββββββββββ| 236/250 [05:44<00:22, 1.64s/it]
Training 1/1 epoch (loss 2.6257): 94%|ββββββββββ| 236/250 [05:45<00:22, 1.64s/it]
Training 1/1 epoch (loss 2.6257): 95%|ββββββββββ| 237/250 [05:45<00:17, 1.38s/it]
Training 1/1 epoch (loss 2.5842): 95%|ββββββββββ| 237/250 [05:47<00:17, 1.38s/it]
Training 1/1 epoch (loss 2.5842): 95%|ββββββββββ| 238/250 [05:47<00:18, 1.51s/it]
Training 1/1 epoch (loss 2.6685): 95%|ββββββββββ| 238/250 [05:49<00:18, 1.51s/it]
Training 1/1 epoch (loss 2.6685): 96%|ββββββββββ| 239/250 [05:49<00:18, 1.71s/it]
Training 1/1 epoch (loss 2.4850): 96%|ββββββββββ| 239/250 [05:50<00:18, 1.71s/it]
Training 1/1 epoch (loss 2.4850): 96%|ββββββββββ| 240/250 [05:50<00:15, 1.58s/it]
Training 1/1 epoch (loss 2.7154): 96%|ββββββββββ| 240/250 [05:53<00:15, 1.58s/it]
Training 1/1 epoch (loss 2.7154): 96%|ββββββββββ| 241/250 [05:53<00:16, 1.82s/it]
Training 1/1 epoch (loss 2.6025): 96%|ββββββββββ| 241/250 [05:53<00:16, 1.82s/it]
Training 1/1 epoch (loss 2.6025): 97%|ββββββββββ| 242/250 [05:53<00:12, 1.52s/it]
Training 1/1 epoch (loss 2.7888): 97%|ββββββββββ| 242/250 [05:56<00:12, 1.52s/it]
Training 1/1 epoch (loss 2.7888): 97%|ββββββββββ| 243/250 [05:56<00:11, 1.69s/it]
Training 1/1 epoch (loss 2.6905): 97%|ββββββββββ| 243/250 [05:57<00:11, 1.69s/it]
Training 1/1 epoch (loss 2.6905): 98%|ββββββββββ| 244/250 [05:57<00:10, 1.74s/it]
Training 1/1 epoch (loss 2.7521): 98%|ββββββββββ| 244/250 [05:58<00:10, 1.74s/it]
Training 1/1 epoch (loss 2.7521): 98%|ββββββββββ| 245/250 [05:58<00:06, 1.40s/it]
Training 1/1 epoch (loss 2.7745): 98%|ββββββββββ| 245/250 [06:00<00:06, 1.40s/it]
Training 1/1 epoch (loss 2.7745): 98%|ββββββββββ| 246/250 [06:00<00:06, 1.67s/it]
Training 1/1 epoch (loss 2.6779): 98%|ββββββββββ| 246/250 [06:03<00:06, 1.67s/it]
Training 1/1 epoch (loss 2.6779): 99%|ββββββββββ| 247/250 [06:03<00:05, 1.83s/it]
Training 1/1 epoch (loss 2.7964): 99%|ββββββββββ| 247/250 [06:03<00:05, 1.83s/it]
Training 1/1 epoch (loss 2.7964): 99%|ββββββββββ| 248/250 [06:03<00:02, 1.41s/it]
Training 1/1 epoch (loss 2.6289): 99%|ββββββββββ| 248/250 [06:05<00:02, 1.41s/it]
Training 1/1 epoch (loss 2.6289): 100%|ββββββββββ| 249/250 [06:05<00:01, 1.49s/it]
Training 1/1 epoch (loss 2.7461): 100%|ββββββββββ| 249/250 [06:06<00:01, 1.49s/it]
Training 1/1 epoch (loss 2.7461): 100%|ββββββββββ| 250/250 [06:06<00:00, 1.31s/it]
Training 1/1 epoch (loss 2.7461): 100%|ββββββββββ| 250/250 [06:06<00:00, 1.46s/it] |
|
tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000-Q2-2000/tokenizer_config.json |
|
Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000-Q2-2000/special_tokens_map.json |
|
wandb: ERROR Problem finishing run |
|
Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x15513e434c10>> |
|
Traceback (most recent call last): |
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
|
return func(*args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^ |
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
|
self.wandb.finish() |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish |
|
return self._finish(exit_code) |
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish |
|
self._atexit_cleanup(exit_code=exit_code) |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup |
|
self._on_finish() |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish |
|
wait_with_progress( |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
|
return wait_all_with_progress( |
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
|
return asyncio_compat.run(progress_loop_with_timeout) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
|
future = executor.submit(runner.run, fn) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
|
raise RuntimeError( |
|
RuntimeError: cannot schedule new futures after interpreter shutdown |
|
|