|
+ deepspeed |
|
[rank1]:[W528 20:28:56.616013922 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank7]:[W528 20:28:56.707723026 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank2]:[W528 20:28:56.771028194 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank6]:[W528 20:28:56.792126207 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank3]:[W528 20:28:56.818517399 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank4]:[W528 20:28:56.824782463 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank0]:[W528 20:28:57.387286905 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
[rank5]:[W528 20:28:57.394591796 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
Model config Qwen2Config { |
|
"_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B", |
|
"architectures": [ |
|
"Qwen2ForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"hidden_act": "silu", |
|
"hidden_size": 2560, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 6912, |
|
"max_position_embeddings": 32768, |
|
"max_window_layers": 21, |
|
"model_type": "qwen2", |
|
"num_attention_heads": 20, |
|
"num_hidden_layers": 40, |
|
"num_key_value_heads": 20, |
|
"rms_norm_eps": 1e-06, |
|
"rope_scaling": null, |
|
"rope_theta": 5000000.0, |
|
"sliding_window": 32768, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "bfloat16", |
|
"transformers_version": "4.49.0", |
|
"use_cache": true, |
|
"use_sliding_window": false, |
|
"vocab_size": 151936 |
|
} |
|
|
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Will use torch_dtype=torch.bfloat16 as defined in model's config object |
|
Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
|
Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643 |
|
} |
|
|
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered. |
|
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.73s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.73s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.74s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.75s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.76s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.76s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.76s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:01<00:01, 1.76s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.57s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.57s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.59s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.59s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.57s/it]All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.59s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.57s/it]Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.60s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.56s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.59s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.57s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.60s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.58s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.61s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.76s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:03<00:00, 1.76s/it] |
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
|
Generate config GenerationConfig { |
|
"bos_token_id": 151643, |
|
"eos_token_id": 151643, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
loading file vocab.json |
|
loading file merges.txt |
|
loading file tokenizer.json |
|
loading file added_tokens.json |
|
loading file special_tokens_map.json |
|
loading file tokenizer_config.json |
|
loading file chat_template.jinja |
|
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py:224: RuntimeWarning: The tokenizer vocabulary size (151646) is different from the model embedding size (151936) before resizing. |
|
resize_tokenizer_embedding(tokenizer=tokenizer, model=model) |
|
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
|
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
|
|
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
|
/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. |
|
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. |
|
warnings.warn( |
|
Building extension module fused_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam...Loading extension module fused_adam...Loading extension module fused_adam...Loading extension module fused_adam...Loading extension module fused_adam... |
|
|
|
|
|
|
|
|
|
Loading extension module fused_adam... |
|
Loading extension module fused_adam... |
|
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login |
|
wandb: Tracking run with wandb version 0.19.8 |
|
wandb: Run data is saved locally in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x |
|
wandb: Run `wandb offline` to turn off syncing. |
|
wandb: Syncing run qwen-4b-s3-Q1-10k |
|
wandb: βοΈ View project at https://wandb.ai/xtom/Inverse_Alignment |
|
wandb: π View run at https://wandb.ai/xtom/Inverse_Alignment/runs/skis4g6x |
|
Training 1/1 epoch: 0%| | 0/313 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
Training 1/1 epoch (loss 1.9969): 0%| | 0/313 [00:06<?, ?it/s]
Training 1/1 epoch (loss 1.9969): 0%| | 1/313 [00:06<34:02, 6.55s/it]
Training 1/1 epoch (loss 1.9435): 0%| | 1/313 [00:09<34:02, 6.55s/it]
Training 1/1 epoch (loss 1.9435): 1%| | 2/313 [00:09<21:36, 4.17s/it]
Training 1/1 epoch (loss 1.8559): 1%| | 2/313 [00:09<21:36, 4.17s/it]
Training 1/1 epoch (loss 1.8559): 1%| | 3/313 [00:09<13:11, 2.55s/it]
Training 1/1 epoch (loss 1.9193): 1%| | 3/313 [00:10<13:11, 2.55s/it]
Training 1/1 epoch (loss 1.9193): 1%|β | 4/313 [00:10<09:11, 1.79s/it]
Training 1/1 epoch (loss 1.9578): 1%|β | 4/313 [00:10<09:11, 1.79s/it]
Training 1/1 epoch (loss 1.9578): 2%|β | 5/313 [00:10<07:05, 1.38s/it]
Training 1/1 epoch (loss 1.9612): 2%|β | 5/313 [00:11<07:05, 1.38s/it]
Training 1/1 epoch (loss 1.9612): 2%|β | 6/313 [00:11<05:43, 1.12s/it]
Training 1/1 epoch (loss 1.9367): 2%|β | 6/313 [00:12<05:43, 1.12s/it]
Training 1/1 epoch (loss 1.9367): 2%|β | 7/313 [00:12<04:47, 1.07it/s]
Training 1/1 epoch (loss 1.8663): 2%|β | 7/313 [00:12<04:47, 1.07it/s]
Training 1/1 epoch (loss 1.8663): 3%|β | 8/313 [00:12<04:22, 1.16it/s]
Training 1/1 epoch (loss 1.8449): 3%|β | 8/313 [00:13<04:22, 1.16it/s]
Training 1/1 epoch (loss 1.8449): 3%|β | 9/313 [00:13<03:50, 1.32it/s]
Training 1/1 epoch (loss 1.7702): 3%|β | 9/313 [00:13<03:50, 1.32it/s]
Training 1/1 epoch (loss 1.7702): 3%|β | 10/313 [00:13<03:25, 1.47it/s]
Training 1/1 epoch (loss 1.7934): 3%|β | 10/313 [00:14<03:25, 1.47it/s]
Training 1/1 epoch (loss 1.7934): 4%|β | 11/313 [00:14<03:08, 1.60it/s]
Training 1/1 epoch (loss 1.6283): 4%|β | 11/313 [00:14<03:08, 1.60it/s]
Training 1/1 epoch (loss 1.6283): 4%|β | 12/313 [00:14<02:56, 1.70it/s]
Training 1/1 epoch (loss 1.7163): 4%|β | 12/313 [00:15<02:56, 1.70it/s]
Training 1/1 epoch (loss 1.7163): 4%|β | 13/313 [00:15<02:48, 1.78it/s]
Training 1/1 epoch (loss 1.8233): 4%|β | 13/313 [00:15<02:48, 1.78it/s]
Training 1/1 epoch (loss 1.8233): 4%|β | 14/313 [00:15<02:41, 1.85it/s]
Training 1/1 epoch (loss 1.8071): 4%|β | 14/313 [00:16<02:41, 1.85it/s]
Training 1/1 epoch (loss 1.8071): 5%|β | 15/313 [00:16<02:36, 1.91it/s]
Training 1/1 epoch (loss 1.7738): 5%|β | 15/313 [00:16<02:36, 1.91it/s]
Training 1/1 epoch (loss 1.7738): 5%|β | 16/313 [00:16<02:38, 1.88it/s]
Training 1/1 epoch (loss 1.6783): 5%|β | 16/313 [00:17<02:38, 1.88it/s]
Training 1/1 epoch (loss 1.6783): 5%|β | 17/313 [00:17<02:36, 1.89it/s]
Training 1/1 epoch (loss 1.6430): 5%|β | 17/313 [00:17<02:36, 1.89it/s]
Training 1/1 epoch (loss 1.6430): 6%|β | 18/313 [00:17<02:31, 1.94it/s]
Training 1/1 epoch (loss 1.6190): 6%|β | 18/313 [00:18<02:31, 1.94it/s]
Training 1/1 epoch (loss 1.6190): 6%|β | 19/313 [00:18<02:29, 1.97it/s]
Training 1/1 epoch (loss 1.7658): 6%|β | 19/313 [00:18<02:29, 1.97it/s]
Training 1/1 epoch (loss 1.7658): 6%|β | 20/313 [00:18<02:31, 1.94it/s]
Training 1/1 epoch (loss 1.5424): 6%|β | 20/313 [00:19<02:31, 1.94it/s]
Training 1/1 epoch (loss 1.5424): 7%|β | 21/313 [00:19<02:28, 1.96it/s]
Training 1/1 epoch (loss 1.7133): 7%|β | 21/313 [00:19<02:28, 1.96it/s]
Training 1/1 epoch (loss 1.7133): 7%|β | 22/313 [00:19<02:27, 1.97it/s]
Training 1/1 epoch (loss 1.6683): 7%|β | 22/313 [00:20<02:27, 1.97it/s]
Training 1/1 epoch (loss 1.6683): 7%|β | 23/313 [00:20<02:25, 1.99it/s]
Training 1/1 epoch (loss 1.5875): 7%|β | 23/313 [00:20<02:25, 1.99it/s]
Training 1/1 epoch (loss 1.5875): 8%|β | 24/313 [00:20<02:28, 1.95it/s]
Training 1/1 epoch (loss 1.6542): 8%|β | 24/313 [00:21<02:28, 1.95it/s]
Training 1/1 epoch (loss 1.6542): 8%|β | 25/313 [00:21<02:26, 1.96it/s]
Training 1/1 epoch (loss 1.5447): 8%|β | 25/313 [00:21<02:26, 1.96it/s]
Training 1/1 epoch (loss 1.5447): 8%|β | 26/313 [00:21<02:24, 1.99it/s]
Training 1/1 epoch (loss 1.5617): 8%|β | 26/313 [00:22<02:24, 1.99it/s]
Training 1/1 epoch (loss 1.5617): 9%|β | 27/313 [00:22<02:22, 2.01it/s]
Training 1/1 epoch (loss 1.6580): 9%|β | 27/313 [00:22<02:22, 2.01it/s]
Training 1/1 epoch (loss 1.6580): 9%|β | 28/313 [00:22<02:23, 1.99it/s]
Training 1/1 epoch (loss 1.7572): 9%|β | 28/313 [00:23<02:23, 1.99it/s]
Training 1/1 epoch (loss 1.7572): 9%|β | 29/313 [00:23<02:23, 1.98it/s]
Training 1/1 epoch (loss 1.6233): 9%|β | 29/313 [00:23<02:23, 1.98it/s]
Training 1/1 epoch (loss 1.6233): 10%|β | 30/313 [00:23<02:22, 1.99it/s]
Training 1/1 epoch (loss 1.5753): 10%|β | 30/313 [00:24<02:22, 1.99it/s]
Training 1/1 epoch (loss 1.5753): 10%|β | 31/313 [00:24<02:22, 1.98it/s]
Training 1/1 epoch (loss 1.6465): 10%|β | 31/313 [00:25<02:22, 1.98it/s]
Training 1/1 epoch (loss 1.6465): 10%|β | 32/313 [00:25<02:32, 1.84it/s]
Training 1/1 epoch (loss 1.5676): 10%|β | 32/313 [00:25<02:32, 1.84it/s]
Training 1/1 epoch (loss 1.5676): 11%|β | 33/313 [00:25<02:36, 1.78it/s]
Training 1/1 epoch (loss 1.6156): 11%|β | 33/313 [00:26<02:36, 1.78it/s]
Training 1/1 epoch (loss 1.6156): 11%|β | 34/313 [00:26<02:36, 1.78it/s]
Training 1/1 epoch (loss 1.6045): 11%|β | 34/313 [00:26<02:36, 1.78it/s]
Training 1/1 epoch (loss 1.6045): 11%|β | 35/313 [00:26<02:33, 1.81it/s]
Training 1/1 epoch (loss 1.6350): 11%|β | 35/313 [00:27<02:33, 1.81it/s]
Training 1/1 epoch (loss 1.6350): 12%|ββ | 36/313 [00:27<02:30, 1.83it/s]
Training 1/1 epoch (loss 1.6478): 12%|ββ | 36/313 [00:27<02:30, 1.83it/s]
Training 1/1 epoch (loss 1.6478): 12%|ββ | 37/313 [00:27<02:25, 1.89it/s]
Training 1/1 epoch (loss 1.6211): 12%|ββ | 37/313 [00:28<02:25, 1.89it/s]
Training 1/1 epoch (loss 1.6211): 12%|ββ | 38/313 [00:28<02:21, 1.94it/s]
Training 1/1 epoch (loss 1.5940): 12%|ββ | 38/313 [00:28<02:21, 1.94it/s]
Training 1/1 epoch (loss 1.5940): 12%|ββ | 39/313 [00:28<02:21, 1.93it/s]
Training 1/1 epoch (loss 1.6426): 12%|ββ | 39/313 [00:29<02:21, 1.93it/s]
Training 1/1 epoch (loss 1.6426): 13%|ββ | 40/313 [00:29<02:27, 1.85it/s]
Training 1/1 epoch (loss 1.6020): 13%|ββ | 40/313 [00:29<02:27, 1.85it/s]
Training 1/1 epoch (loss 1.6020): 13%|ββ | 41/313 [00:29<02:22, 1.91it/s]
Training 1/1 epoch (loss 1.6420): 13%|ββ | 41/313 [00:30<02:22, 1.91it/s]
Training 1/1 epoch (loss 1.6420): 13%|ββ | 42/313 [00:30<02:20, 1.93it/s]
Training 1/1 epoch (loss 1.5219): 13%|ββ | 42/313 [00:30<02:20, 1.93it/s]
Training 1/1 epoch (loss 1.5219): 14%|ββ | 43/313 [00:30<02:18, 1.95it/s]
Training 1/1 epoch (loss 1.6230): 14%|ββ | 43/313 [00:31<02:18, 1.95it/s]
Training 1/1 epoch (loss 1.6230): 14%|ββ | 44/313 [00:31<02:17, 1.96it/s]
Training 1/1 epoch (loss 1.6545): 14%|ββ | 44/313 [00:32<02:17, 1.96it/s]
Training 1/1 epoch (loss 1.6545): 14%|ββ | 45/313 [00:32<02:23, 1.86it/s]
Training 1/1 epoch (loss 1.5977): 14%|ββ | 45/313 [00:32<02:23, 1.86it/s]
Training 1/1 epoch (loss 1.5977): 15%|ββ | 46/313 [00:32<02:19, 1.91it/s]
Training 1/1 epoch (loss 1.6383): 15%|ββ | 46/313 [00:33<02:19, 1.91it/s]
Training 1/1 epoch (loss 1.6383): 15%|ββ | 47/313 [00:33<02:19, 1.91it/s]
Training 1/1 epoch (loss 1.6238): 15%|ββ | 47/313 [00:33<02:19, 1.91it/s]
Training 1/1 epoch (loss 1.6238): 15%|ββ | 48/313 [00:33<02:20, 1.89it/s]
Training 1/1 epoch (loss 1.5379): 15%|ββ | 48/313 [00:34<02:20, 1.89it/s]
Training 1/1 epoch (loss 1.5379): 16%|ββ | 49/313 [00:34<02:21, 1.86it/s]
Training 1/1 epoch (loss 1.5923): 16%|ββ | 49/313 [00:34<02:21, 1.86it/s]
Training 1/1 epoch (loss 1.5923): 16%|ββ | 50/313 [00:34<02:25, 1.81it/s]
Training 1/1 epoch (loss 1.7624): 16%|ββ | 50/313 [00:35<02:25, 1.81it/s]
Training 1/1 epoch (loss 1.7624): 16%|ββ | 51/313 [00:35<02:29, 1.76it/s]
Training 1/1 epoch (loss 1.7058): 16%|ββ | 51/313 [00:35<02:29, 1.76it/s]
Training 1/1 epoch (loss 1.7058): 17%|ββ | 52/313 [00:35<02:34, 1.69it/s]
Training 1/1 epoch (loss 1.5322): 17%|ββ | 52/313 [00:36<02:34, 1.69it/s]
Training 1/1 epoch (loss 1.5322): 17%|ββ | 53/313 [00:36<02:34, 1.68it/s]
Training 1/1 epoch (loss 1.6093): 17%|ββ | 53/313 [00:37<02:34, 1.68it/s]
Training 1/1 epoch (loss 1.6093): 17%|ββ | 54/313 [00:37<02:37, 1.65it/s]
Training 1/1 epoch (loss 1.6716): 17%|ββ | 54/313 [00:37<02:37, 1.65it/s]
Training 1/1 epoch (loss 1.6716): 18%|ββ | 55/313 [00:37<02:35, 1.66it/s]
Training 1/1 epoch (loss 1.5030): 18%|ββ | 55/313 [00:38<02:35, 1.66it/s]
Training 1/1 epoch (loss 1.5030): 18%|ββ | 56/313 [00:38<02:36, 1.64it/s]
Training 1/1 epoch (loss 1.6169): 18%|ββ | 56/313 [00:38<02:36, 1.64it/s]
Training 1/1 epoch (loss 1.6169): 18%|ββ | 57/313 [00:38<02:32, 1.67it/s]
Training 1/1 epoch (loss 1.4965): 18%|ββ | 57/313 [00:39<02:32, 1.67it/s]
Training 1/1 epoch (loss 1.4965): 19%|ββ | 58/313 [00:39<02:39, 1.60it/s]
Training 1/1 epoch (loss 1.5634): 19%|ββ | 58/313 [00:40<02:39, 1.60it/s]
Training 1/1 epoch (loss 1.5634): 19%|ββ | 59/313 [00:40<02:34, 1.65it/s]
Training 1/1 epoch (loss 1.6295): 19%|ββ | 59/313 [00:40<02:34, 1.65it/s]
Training 1/1 epoch (loss 1.6295): 19%|ββ | 60/313 [00:40<02:32, 1.66it/s]
Training 1/1 epoch (loss 1.5314): 19%|ββ | 60/313 [00:41<02:32, 1.66it/s]
Training 1/1 epoch (loss 1.5314): 19%|ββ | 61/313 [00:41<02:30, 1.67it/s]
Training 1/1 epoch (loss 1.5546): 19%|ββ | 61/313 [00:42<02:30, 1.67it/s]
Training 1/1 epoch (loss 1.5546): 20%|ββ | 62/313 [00:42<02:29, 1.68it/s]
Training 1/1 epoch (loss 1.6915): 20%|ββ | 62/313 [00:42<02:29, 1.68it/s]
Training 1/1 epoch (loss 1.6915): 20%|ββ | 63/313 [00:42<02:30, 1.66it/s]
Training 1/1 epoch (loss 1.5631): 20%|ββ | 63/313 [00:43<02:30, 1.66it/s]
Training 1/1 epoch (loss 1.5631): 20%|ββ | 64/313 [00:43<02:36, 1.59it/s]
Training 1/1 epoch (loss 1.6443): 20%|ββ | 64/313 [00:43<02:36, 1.59it/s]
Training 1/1 epoch (loss 1.6443): 21%|ββ | 65/313 [00:43<02:33, 1.62it/s]
Training 1/1 epoch (loss 1.6063): 21%|ββ | 65/313 [00:44<02:33, 1.62it/s]
Training 1/1 epoch (loss 1.6063): 21%|ββ | 66/313 [00:44<02:29, 1.65it/s]
Training 1/1 epoch (loss 1.6434): 21%|ββ | 66/313 [00:45<02:29, 1.65it/s]
Training 1/1 epoch (loss 1.6434): 21%|βββ | 67/313 [00:45<02:28, 1.66it/s]
Training 1/1 epoch (loss 1.4765): 21%|βββ | 67/313 [00:45<02:28, 1.66it/s]
Training 1/1 epoch (loss 1.4765): 22%|βββ | 68/313 [00:45<02:24, 1.69it/s]
Training 1/1 epoch (loss 1.5883): 22%|βββ | 68/313 [00:46<02:24, 1.69it/s]
Training 1/1 epoch (loss 1.5883): 22%|βββ | 69/313 [00:46<02:22, 1.72it/s]
Training 1/1 epoch (loss 1.4743): 22%|βββ | 69/313 [00:46<02:22, 1.72it/s]
Training 1/1 epoch (loss 1.4743): 22%|βββ | 70/313 [00:46<02:21, 1.72it/s]
Training 1/1 epoch (loss 1.4694): 22%|βββ | 70/313 [00:47<02:21, 1.72it/s]
Training 1/1 epoch (loss 1.4694): 23%|βββ | 71/313 [00:47<02:20, 1.72it/s]
Training 1/1 epoch (loss 1.5781): 23%|βββ | 71/313 [00:48<02:20, 1.72it/s]
Training 1/1 epoch (loss 1.5781): 23%|βββ | 72/313 [00:48<02:24, 1.67it/s]
Training 1/1 epoch (loss 1.6075): 23%|βββ | 72/313 [00:48<02:24, 1.67it/s]
Training 1/1 epoch (loss 1.6075): 23%|βββ | 73/313 [00:48<02:27, 1.63it/s]
Training 1/1 epoch (loss 1.4934): 23%|βββ | 73/313 [00:49<02:27, 1.63it/s]
Training 1/1 epoch (loss 1.4934): 24%|βββ | 74/313 [00:49<02:29, 1.60it/s]
Training 1/1 epoch (loss 1.5534): 24%|βββ | 74/313 [00:49<02:29, 1.60it/s]
Training 1/1 epoch (loss 1.5534): 24%|βββ | 75/313 [00:49<02:25, 1.63it/s]
Training 1/1 epoch (loss 1.4987): 24%|βββ | 75/313 [00:50<02:25, 1.63it/s]
Training 1/1 epoch (loss 1.4987): 24%|βββ | 76/313 [00:50<02:23, 1.65it/s]
Training 1/1 epoch (loss 1.6022): 24%|βββ | 76/313 [00:51<02:23, 1.65it/s]
Training 1/1 epoch (loss 1.6022): 25%|βββ | 77/313 [00:51<02:21, 1.67it/s]
Training 1/1 epoch (loss 1.6577): 25%|βββ | 77/313 [00:51<02:21, 1.67it/s]
Training 1/1 epoch (loss 1.6577): 25%|βββ | 78/313 [00:51<02:18, 1.70it/s]
Training 1/1 epoch (loss 1.4478): 25%|βββ | 78/313 [00:52<02:18, 1.70it/s]
Training 1/1 epoch (loss 1.4478): 25%|βββ | 79/313 [00:52<02:18, 1.69it/s]
Training 1/1 epoch (loss 1.5968): 25%|βββ | 79/313 [00:52<02:18, 1.69it/s]
Training 1/1 epoch (loss 1.5968): 26%|βββ | 80/313 [00:52<02:22, 1.64it/s]
Training 1/1 epoch (loss 1.5386): 26%|βββ | 80/313 [00:53<02:22, 1.64it/s]
Training 1/1 epoch (loss 1.5386): 26%|βββ | 81/313 [00:53<02:20, 1.65it/s]
Training 1/1 epoch (loss 1.5288): 26%|βββ | 81/313 [00:54<02:20, 1.65it/s]
Training 1/1 epoch (loss 1.5288): 26%|βββ | 82/313 [00:54<02:16, 1.69it/s]
Training 1/1 epoch (loss 1.5583): 26%|βββ | 82/313 [00:54<02:16, 1.69it/s]
Training 1/1 epoch (loss 1.5583): 27%|βββ | 83/313 [00:54<02:09, 1.78it/s]
Training 1/1 epoch (loss 1.5892): 27%|βββ | 83/313 [00:55<02:09, 1.78it/s]
Training 1/1 epoch (loss 1.5892): 27%|βββ | 84/313 [00:55<02:04, 1.84it/s]
Training 1/1 epoch (loss 1.4888): 27%|βββ | 84/313 [00:55<02:04, 1.84it/s]
Training 1/1 epoch (loss 1.4888): 27%|βββ | 85/313 [00:55<02:03, 1.85it/s]
Training 1/1 epoch (loss 1.4864): 27%|βββ | 85/313 [00:56<02:03, 1.85it/s]
Training 1/1 epoch (loss 1.4864): 27%|βββ | 86/313 [00:56<02:01, 1.87it/s]
Training 1/1 epoch (loss 1.5522): 27%|βββ | 86/313 [00:56<02:01, 1.87it/s]
Training 1/1 epoch (loss 1.5522): 28%|βββ | 87/313 [00:56<01:56, 1.93it/s]
Training 1/1 epoch (loss 1.5104): 28%|βββ | 87/313 [00:57<01:56, 1.93it/s]
Training 1/1 epoch (loss 1.5104): 28%|βββ | 88/313 [00:57<01:59, 1.88it/s]
Training 1/1 epoch (loss 1.5643): 28%|βββ | 88/313 [00:57<01:59, 1.88it/s]
Training 1/1 epoch (loss 1.5643): 28%|βββ | 89/313 [00:57<01:56, 1.93it/s]
Training 1/1 epoch (loss 1.5999): 28%|βββ | 89/313 [00:58<01:56, 1.93it/s]
Training 1/1 epoch (loss 1.5999): 29%|βββ | 90/313 [00:58<01:54, 1.95it/s]
Training 1/1 epoch (loss 1.5838): 29%|βββ | 90/313 [00:58<01:54, 1.95it/s]
Training 1/1 epoch (loss 1.5838): 29%|βββ | 91/313 [00:58<01:53, 1.96it/s]
Training 1/1 epoch (loss 1.5707): 29%|βββ | 91/313 [00:59<01:53, 1.96it/s]
Training 1/1 epoch (loss 1.5707): 29%|βββ | 92/313 [00:59<01:53, 1.95it/s]
Training 1/1 epoch (loss 1.5591): 29%|βββ | 92/313 [00:59<01:53, 1.95it/s]
Training 1/1 epoch (loss 1.5591): 30%|βββ | 93/313 [00:59<01:57, 1.87it/s]
Training 1/1 epoch (loss 1.4692): 30%|βββ | 93/313 [01:00<01:57, 1.87it/s]
Training 1/1 epoch (loss 1.4692): 30%|βββ | 94/313 [01:00<02:04, 1.77it/s]
Training 1/1 epoch (loss 1.6833): 30%|βββ | 94/313 [01:00<02:04, 1.77it/s]
Training 1/1 epoch (loss 1.6833): 30%|βββ | 95/313 [01:00<02:06, 1.73it/s]
Training 1/1 epoch (loss 1.5013): 30%|βββ | 95/313 [01:01<02:06, 1.73it/s]
Training 1/1 epoch (loss 1.5013): 31%|βββ | 96/313 [01:01<02:09, 1.68it/s]
Training 1/1 epoch (loss 1.3842): 31%|βββ | 96/313 [01:02<02:09, 1.68it/s]
Training 1/1 epoch (loss 1.3842): 31%|βββ | 97/313 [01:02<02:07, 1.70it/s]
Training 1/1 epoch (loss 1.5485): 31%|βββ | 97/313 [01:02<02:07, 1.70it/s]
Training 1/1 epoch (loss 1.5485): 31%|ββββ | 98/313 [01:02<02:04, 1.73it/s]
Training 1/1 epoch (loss 1.6869): 31%|ββββ | 98/313 [01:03<02:04, 1.73it/s]
Training 1/1 epoch (loss 1.6869): 32%|ββββ | 99/313 [01:03<02:04, 1.73it/s]
Training 1/1 epoch (loss 1.4729): 32%|ββββ | 99/313 [01:03<02:04, 1.73it/s]
Training 1/1 epoch (loss 1.4729): 32%|ββββ | 100/313 [01:03<02:01, 1.75it/s]
Training 1/1 epoch (loss 1.5249): 32%|ββββ | 100/313 [01:04<02:01, 1.75it/s]
Training 1/1 epoch (loss 1.5249): 32%|ββββ | 101/313 [01:04<02:01, 1.74it/s]
Training 1/1 epoch (loss 1.6004): 32%|ββββ | 101/313 [01:05<02:01, 1.74it/s]
Training 1/1 epoch (loss 1.6004): 33%|ββββ | 102/313 [01:05<02:02, 1.72it/s]
Training 1/1 epoch (loss 1.4560): 33%|ββββ | 102/313 [01:05<02:02, 1.72it/s]
Training 1/1 epoch (loss 1.4560): 33%|ββββ | 103/313 [01:05<02:00, 1.74it/s]
Training 1/1 epoch (loss 1.5102): 33%|ββββ | 103/313 [01:06<02:00, 1.74it/s]
Training 1/1 epoch (loss 1.5102): 33%|ββββ | 104/313 [01:06<01:57, 1.77it/s]
Training 1/1 epoch (loss 1.6432): 33%|ββββ | 104/313 [01:06<01:57, 1.77it/s]
Training 1/1 epoch (loss 1.6432): 34%|ββββ | 105/313 [01:06<01:53, 1.84it/s]
Training 1/1 epoch (loss 1.6845): 34%|ββββ | 105/313 [01:07<01:53, 1.84it/s]
Training 1/1 epoch (loss 1.6845): 34%|ββββ | 106/313 [01:07<01:51, 1.86it/s]
Training 1/1 epoch (loss 1.5082): 34%|ββββ | 106/313 [01:07<01:51, 1.86it/s]
Training 1/1 epoch (loss 1.5082): 34%|ββββ | 107/313 [01:07<01:48, 1.89it/s]
Training 1/1 epoch (loss 1.5530): 34%|ββββ | 107/313 [01:08<01:48, 1.89it/s]
Training 1/1 epoch (loss 1.5530): 35%|ββββ | 108/313 [01:08<01:45, 1.94it/s]
Training 1/1 epoch (loss 1.5151): 35%|ββββ | 108/313 [01:08<01:45, 1.94it/s]
Training 1/1 epoch (loss 1.5151): 35%|ββββ | 109/313 [01:08<01:43, 1.97it/s]
Training 1/1 epoch (loss 1.5404): 35%|ββββ | 109/313 [01:09<01:43, 1.97it/s]
Training 1/1 epoch (loss 1.5404): 35%|ββββ | 110/313 [01:09<01:43, 1.97it/s]
Training 1/1 epoch (loss 1.5282): 35%|ββββ | 110/313 [01:09<01:43, 1.97it/s]
Training 1/1 epoch (loss 1.5282): 35%|ββββ | 111/313 [01:09<01:51, 1.82it/s]
Training 1/1 epoch (loss 1.5188): 35%|ββββ | 111/313 [01:10<01:51, 1.82it/s]
Training 1/1 epoch (loss 1.5188): 36%|ββββ | 112/313 [01:10<01:55, 1.74it/s]
Training 1/1 epoch (loss 1.4890): 36%|ββββ | 112/313 [01:11<01:55, 1.74it/s]
Training 1/1 epoch (loss 1.4890): 36%|ββββ | 113/313 [01:11<01:55, 1.73it/s]
Training 1/1 epoch (loss 1.5756): 36%|ββββ | 113/313 [01:11<01:55, 1.73it/s]
Training 1/1 epoch (loss 1.5756): 36%|ββββ | 114/313 [01:11<01:51, 1.79it/s]
Training 1/1 epoch (loss 1.4864): 36%|ββββ | 114/313 [01:12<01:51, 1.79it/s]
Training 1/1 epoch (loss 1.4864): 37%|ββββ | 115/313 [01:12<01:47, 1.84it/s]
Training 1/1 epoch (loss 1.6112): 37%|ββββ | 115/313 [01:12<01:47, 1.84it/s]
Training 1/1 epoch (loss 1.6112): 37%|ββββ | 116/313 [01:12<01:44, 1.88it/s]
Training 1/1 epoch (loss 1.6032): 37%|ββββ | 116/313 [01:13<01:44, 1.88it/s]
Training 1/1 epoch (loss 1.6032): 37%|ββββ | 117/313 [01:13<01:42, 1.91it/s]
Training 1/1 epoch (loss 1.5925): 37%|ββββ | 117/313 [01:13<01:42, 1.91it/s]
Training 1/1 epoch (loss 1.5925): 38%|ββββ | 118/313 [01:13<01:41, 1.92it/s]
Training 1/1 epoch (loss 1.5946): 38%|ββββ | 118/313 [01:14<01:41, 1.92it/s]
Training 1/1 epoch (loss 1.5946): 38%|ββββ | 119/313 [01:14<01:40, 1.94it/s]
Training 1/1 epoch (loss 1.5030): 38%|ββββ | 119/313 [01:14<01:40, 1.94it/s]
Training 1/1 epoch (loss 1.5030): 38%|ββββ | 120/313 [01:14<01:43, 1.87it/s]
Training 1/1 epoch (loss 1.6072): 38%|ββββ | 120/313 [01:15<01:43, 1.87it/s]
Training 1/1 epoch (loss 1.6072): 39%|ββββ | 121/313 [01:15<01:42, 1.88it/s]
Training 1/1 epoch (loss 1.6167): 39%|ββββ | 121/313 [01:15<01:42, 1.88it/s]
Training 1/1 epoch (loss 1.6167): 39%|ββββ | 122/313 [01:15<01:40, 1.90it/s]
Training 1/1 epoch (loss 1.3506): 39%|ββββ | 122/313 [01:16<01:40, 1.90it/s]
Training 1/1 epoch (loss 1.3506): 39%|ββββ | 123/313 [01:16<01:37, 1.94it/s]
Training 1/1 epoch (loss 1.6567): 39%|ββββ | 123/313 [01:16<01:37, 1.94it/s]
Training 1/1 epoch (loss 1.6567): 40%|ββββ | 124/313 [01:16<01:36, 1.97it/s]
Training 1/1 epoch (loss 1.6780): 40%|ββββ | 124/313 [01:17<01:36, 1.97it/s]
Training 1/1 epoch (loss 1.6780): 40%|ββββ | 125/313 [01:17<01:35, 1.96it/s]
Training 1/1 epoch (loss 1.6126): 40%|ββββ | 125/313 [01:17<01:35, 1.96it/s]
Training 1/1 epoch (loss 1.6126): 40%|ββββ | 126/313 [01:17<01:35, 1.95it/s]
Training 1/1 epoch (loss 1.5259): 40%|ββββ | 126/313 [01:18<01:35, 1.95it/s]
Training 1/1 epoch (loss 1.5259): 41%|ββββ | 127/313 [01:18<01:34, 1.97it/s]
Training 1/1 epoch (loss 1.4742): 41%|ββββ | 127/313 [01:18<01:34, 1.97it/s]
Training 1/1 epoch (loss 1.4742): 41%|ββββ | 128/313 [01:18<01:36, 1.92it/s]
Training 1/1 epoch (loss 1.5650): 41%|ββββ | 128/313 [01:19<01:36, 1.92it/s]
Training 1/1 epoch (loss 1.5650): 41%|ββββ | 129/313 [01:19<01:36, 1.90it/s]
Training 1/1 epoch (loss 1.5327): 41%|ββββ | 129/313 [01:19<01:36, 1.90it/s]
Training 1/1 epoch (loss 1.5327): 42%|βββββ | 130/313 [01:19<01:34, 1.93it/s]
Training 1/1 epoch (loss 1.5616): 42%|βββββ | 130/313 [01:20<01:34, 1.93it/s]
Training 1/1 epoch (loss 1.5616): 42%|βββββ | 131/313 [01:20<01:33, 1.95it/s]
Training 1/1 epoch (loss 1.5460): 42%|βββββ | 131/313 [01:20<01:33, 1.95it/s]
Training 1/1 epoch (loss 1.5460): 42%|βββββ | 132/313 [01:20<01:34, 1.92it/s]
Training 1/1 epoch (loss 1.5448): 42%|βββββ | 132/313 [01:21<01:34, 1.92it/s]
Training 1/1 epoch (loss 1.5448): 42%|βββββ | 133/313 [01:21<01:33, 1.93it/s]
Training 1/1 epoch (loss 1.5418): 42%|βββββ | 133/313 [01:21<01:33, 1.93it/s]
Training 1/1 epoch (loss 1.5418): 43%|βββββ | 134/313 [01:21<01:31, 1.96it/s]
Training 1/1 epoch (loss 1.5702): 43%|βββββ | 134/313 [01:22<01:31, 1.96it/s]
Training 1/1 epoch (loss 1.5702): 43%|βββββ | 135/313 [01:22<01:28, 2.01it/s]
Training 1/1 epoch (loss 1.5936): 43%|βββββ | 135/313 [01:22<01:28, 2.01it/s]
Training 1/1 epoch (loss 1.5936): 43%|βββββ | 136/313 [01:22<01:30, 1.95it/s]
Training 1/1 epoch (loss 1.6044): 43%|βββββ | 136/313 [01:23<01:30, 1.95it/s]
Training 1/1 epoch (loss 1.6044): 44%|βββββ | 137/313 [01:23<01:30, 1.94it/s]
Training 1/1 epoch (loss 1.4929): 44%|βββββ | 137/313 [01:23<01:30, 1.94it/s]
Training 1/1 epoch (loss 1.4929): 44%|βββββ | 138/313 [01:23<01:28, 1.97it/s]
Training 1/1 epoch (loss 1.5348): 44%|βββββ | 138/313 [01:24<01:28, 1.97it/s]
Training 1/1 epoch (loss 1.5348): 44%|βββββ | 139/313 [01:24<01:27, 1.99it/s]
Training 1/1 epoch (loss 1.5037): 44%|βββββ | 139/313 [01:24<01:27, 1.99it/s]
Training 1/1 epoch (loss 1.5037): 45%|βββββ | 140/313 [01:24<01:27, 1.98it/s]
Training 1/1 epoch (loss 1.6460): 45%|βββββ | 140/313 [01:25<01:27, 1.98it/s]
Training 1/1 epoch (loss 1.6460): 45%|βββββ | 141/313 [01:25<01:28, 1.95it/s]
Training 1/1 epoch (loss 1.5672): 45%|βββββ | 141/313 [01:25<01:28, 1.95it/s]
Training 1/1 epoch (loss 1.5672): 45%|βββββ | 142/313 [01:25<01:28, 1.93it/s]
Training 1/1 epoch (loss 1.4436): 45%|βββββ | 142/313 [01:26<01:28, 1.93it/s]
Training 1/1 epoch (loss 1.4436): 46%|βββββ | 143/313 [01:26<01:27, 1.94it/s]
Training 1/1 epoch (loss 1.5387): 46%|βββββ | 143/313 [01:27<01:27, 1.94it/s]
Training 1/1 epoch (loss 1.5387): 46%|βββββ | 144/313 [01:27<01:30, 1.86it/s]
Training 1/1 epoch (loss 1.4661): 46%|βββββ | 144/313 [01:27<01:30, 1.86it/s]
Training 1/1 epoch (loss 1.4661): 46%|βββββ | 145/313 [01:27<01:28, 1.89it/s]
Training 1/1 epoch (loss 1.5491): 46%|βββββ | 145/313 [01:28<01:28, 1.89it/s]
Training 1/1 epoch (loss 1.5491): 47%|βββββ | 146/313 [01:28<01:27, 1.91it/s]
Training 1/1 epoch (loss 1.4770): 47%|βββββ | 146/313 [01:28<01:27, 1.91it/s]
Training 1/1 epoch (loss 1.4770): 47%|βββββ | 147/313 [01:28<01:24, 1.96it/s]
Training 1/1 epoch (loss 1.6019): 47%|βββββ | 147/313 [01:29<01:24, 1.96it/s]
Training 1/1 epoch (loss 1.6019): 47%|βββββ | 148/313 [01:29<01:24, 1.96it/s]
Training 1/1 epoch (loss 1.5800): 47%|βββββ | 148/313 [01:29<01:24, 1.96it/s]
Training 1/1 epoch (loss 1.5800): 48%|βββββ | 149/313 [01:29<01:22, 1.98it/s]
Training 1/1 epoch (loss 1.6386): 48%|βββββ | 149/313 [01:30<01:22, 1.98it/s]
Training 1/1 epoch (loss 1.6386): 48%|βββββ | 150/313 [01:30<01:23, 1.95it/s]
Training 1/1 epoch (loss 1.5207): 48%|βββββ | 150/313 [01:30<01:23, 1.95it/s]
Training 1/1 epoch (loss 1.5207): 48%|βββββ | 151/313 [01:30<01:37, 1.66it/s]
Training 1/1 epoch (loss 1.5549): 48%|βββββ | 151/313 [01:31<01:37, 1.66it/s]
Training 1/1 epoch (loss 1.5549): 49%|βββββ | 152/313 [01:31<01:36, 1.67it/s]
Training 1/1 epoch (loss 1.3684): 49%|βββββ | 152/313 [01:31<01:36, 1.67it/s]
Training 1/1 epoch (loss 1.3684): 49%|βββββ | 153/313 [01:31<01:30, 1.77it/s]
Training 1/1 epoch (loss 1.5323): 49%|βββββ | 153/313 [01:32<01:30, 1.77it/s]
Training 1/1 epoch (loss 1.5323): 49%|βββββ | 154/313 [01:32<01:46, 1.49it/s]
Training 1/1 epoch (loss 1.6509): 49%|βββββ | 154/313 [01:33<01:46, 1.49it/s]
Training 1/1 epoch (loss 1.6509): 50%|βββββ | 155/313 [01:33<01:37, 1.62it/s]
Training 1/1 epoch (loss 1.6557): 50%|βββββ | 155/313 [01:33<01:37, 1.62it/s]
Training 1/1 epoch (loss 1.6557): 50%|βββββ | 156/313 [01:33<01:30, 1.73it/s]
Training 1/1 epoch (loss 1.4666): 50%|βββββ | 156/313 [01:34<01:30, 1.73it/s]
Training 1/1 epoch (loss 1.4666): 50%|βββββ | 157/313 [01:34<01:25, 1.81it/s]
Training 1/1 epoch (loss 1.4562): 50%|βββββ | 157/313 [01:34<01:25, 1.81it/s]
Training 1/1 epoch (loss 1.4562): 50%|βββββ | 158/313 [01:34<01:22, 1.88it/s]
Training 1/1 epoch (loss 1.4363): 50%|βββββ | 158/313 [01:35<01:22, 1.88it/s]
Training 1/1 epoch (loss 1.4363): 51%|βββββ | 159/313 [01:35<01:24, 1.83it/s]
Training 1/1 epoch (loss 1.5142): 51%|βββββ | 159/313 [01:35<01:24, 1.83it/s]
Training 1/1 epoch (loss 1.5142): 51%|βββββ | 160/313 [01:35<01:24, 1.81it/s]
Training 1/1 epoch (loss 1.5799): 51%|βββββ | 160/313 [01:36<01:24, 1.81it/s]
Training 1/1 epoch (loss 1.5799): 51%|ββββββ | 161/313 [01:36<01:20, 1.88it/s]
Training 1/1 epoch (loss 1.4413): 51%|ββββββ | 161/313 [01:37<01:20, 1.88it/s]
Training 1/1 epoch (loss 1.4413): 52%|ββββββ | 162/313 [01:37<01:21, 1.85it/s]
Training 1/1 epoch (loss 1.5799): 52%|ββββββ | 162/313 [01:37<01:21, 1.85it/s]
Training 1/1 epoch (loss 1.5799): 52%|ββββββ | 163/313 [01:37<01:19, 1.88it/s]
Training 1/1 epoch (loss 1.5006): 52%|ββββββ | 163/313 [01:38<01:19, 1.88it/s]
Training 1/1 epoch (loss 1.5006): 52%|ββββββ | 164/313 [01:38<01:17, 1.92it/s]
Training 1/1 epoch (loss 1.5377): 52%|ββββββ | 164/313 [01:38<01:17, 1.92it/s]
Training 1/1 epoch (loss 1.5377): 53%|ββββββ | 165/313 [01:38<01:15, 1.96it/s]
Training 1/1 epoch (loss 1.5450): 53%|ββββββ | 165/313 [01:39<01:15, 1.96it/s]
Training 1/1 epoch (loss 1.5450): 53%|ββββββ | 166/313 [01:39<01:14, 1.96it/s]
Training 1/1 epoch (loss 1.5932): 53%|ββββββ | 166/313 [01:39<01:14, 1.96it/s]
Training 1/1 epoch (loss 1.5932): 53%|ββββββ | 167/313 [01:39<01:15, 1.93it/s]
Training 1/1 epoch (loss 1.6285): 53%|ββββββ | 167/313 [01:40<01:15, 1.93it/s]
Training 1/1 epoch (loss 1.6285): 54%|ββββββ | 168/313 [01:40<01:16, 1.90it/s]
Training 1/1 epoch (loss 1.6247): 54%|ββββββ | 168/313 [01:40<01:16, 1.90it/s]
Training 1/1 epoch (loss 1.6247): 54%|ββββββ | 169/313 [01:40<01:16, 1.88it/s]
Training 1/1 epoch (loss 1.5738): 54%|ββββββ | 169/313 [01:41<01:16, 1.88it/s]
Training 1/1 epoch (loss 1.5738): 54%|ββββββ | 170/313 [01:41<01:14, 1.91it/s]
Training 1/1 epoch (loss 1.4806): 54%|ββββββ | 170/313 [01:41<01:14, 1.91it/s]
Training 1/1 epoch (loss 1.4806): 55%|ββββββ | 171/313 [01:41<01:13, 1.92it/s]
Training 1/1 epoch (loss 1.5660): 55%|ββββββ | 171/313 [01:42<01:13, 1.92it/s]
Training 1/1 epoch (loss 1.5660): 55%|ββββββ | 172/313 [01:42<01:11, 1.97it/s]
Training 1/1 epoch (loss 1.6140): 55%|ββββββ | 172/313 [01:42<01:11, 1.97it/s]
Training 1/1 epoch (loss 1.6140): 55%|ββββββ | 173/313 [01:42<01:13, 1.91it/s]
Training 1/1 epoch (loss 1.5801): 55%|ββββββ | 173/313 [01:43<01:13, 1.91it/s]
Training 1/1 epoch (loss 1.5801): 56%|ββββββ | 174/313 [01:43<01:15, 1.85it/s]
Training 1/1 epoch (loss 1.5022): 56%|ββββββ | 174/313 [01:43<01:15, 1.85it/s]
Training 1/1 epoch (loss 1.5022): 56%|ββββββ | 175/313 [01:43<01:17, 1.77it/s]
Training 1/1 epoch (loss 1.5984): 56%|ββββββ | 175/313 [01:44<01:17, 1.77it/s]
Training 1/1 epoch (loss 1.5984): 56%|ββββββ | 176/313 [01:44<01:19, 1.72it/s]
Training 1/1 epoch (loss 1.4311): 56%|ββββββ | 176/313 [01:45<01:19, 1.72it/s]
Training 1/1 epoch (loss 1.4311): 57%|ββββββ | 177/313 [01:45<01:18, 1.74it/s]
Training 1/1 epoch (loss 1.5171): 57%|ββββββ | 177/313 [01:45<01:18, 1.74it/s]
Training 1/1 epoch (loss 1.5171): 57%|ββββββ | 178/313 [01:45<01:15, 1.78it/s]
Training 1/1 epoch (loss 1.5258): 57%|ββββββ | 178/313 [01:46<01:15, 1.78it/s]
Training 1/1 epoch (loss 1.5258): 57%|ββββββ | 179/313 [01:46<01:14, 1.80it/s]
Training 1/1 epoch (loss 1.4348): 57%|ββββββ | 179/313 [01:46<01:14, 1.80it/s]
Training 1/1 epoch (loss 1.4348): 58%|ββββββ | 180/313 [01:46<01:12, 1.84it/s]
Training 1/1 epoch (loss 1.5704): 58%|ββββββ | 180/313 [01:47<01:12, 1.84it/s]
Training 1/1 epoch (loss 1.5704): 58%|ββββββ | 181/313 [01:47<01:12, 1.83it/s]
Training 1/1 epoch (loss 1.6025): 58%|ββββββ | 181/313 [01:47<01:12, 1.83it/s]
Training 1/1 epoch (loss 1.6025): 58%|ββββββ | 182/313 [01:47<01:12, 1.81it/s]
Training 1/1 epoch (loss 1.6030): 58%|ββββββ | 182/313 [01:48<01:12, 1.81it/s]
Training 1/1 epoch (loss 1.6030): 58%|ββββββ | 183/313 [01:48<01:11, 1.82it/s]
Training 1/1 epoch (loss 1.5418): 58%|ββββββ | 183/313 [01:48<01:11, 1.82it/s]
Training 1/1 epoch (loss 1.5418): 59%|ββββββ | 184/313 [01:48<01:12, 1.78it/s]
Training 1/1 epoch (loss 1.5492): 59%|ββββββ | 184/313 [01:49<01:12, 1.78it/s]
Training 1/1 epoch (loss 1.5492): 59%|ββββββ | 185/313 [01:49<01:09, 1.84it/s]
Training 1/1 epoch (loss 1.5663): 59%|ββββββ | 185/313 [01:49<01:09, 1.84it/s]
Training 1/1 epoch (loss 1.5663): 59%|ββββββ | 186/313 [01:49<01:07, 1.87it/s]
Training 1/1 epoch (loss 1.5376): 59%|ββββββ | 186/313 [01:50<01:07, 1.87it/s]
Training 1/1 epoch (loss 1.5376): 60%|ββββββ | 187/313 [01:50<01:06, 1.90it/s]
Training 1/1 epoch (loss 1.5649): 60%|ββββββ | 187/313 [01:50<01:06, 1.90it/s]
Training 1/1 epoch (loss 1.5649): 60%|ββββββ | 188/313 [01:50<01:04, 1.94it/s]
Training 1/1 epoch (loss 1.6079): 60%|ββββββ | 188/313 [01:51<01:04, 1.94it/s]
Training 1/1 epoch (loss 1.6079): 60%|ββββββ | 189/313 [01:51<01:05, 1.90it/s]
Training 1/1 epoch (loss 1.5164): 60%|ββββββ | 189/313 [01:51<01:05, 1.90it/s]
Training 1/1 epoch (loss 1.5164): 61%|ββββββ | 190/313 [01:51<01:03, 1.95it/s]
Training 1/1 epoch (loss 1.5304): 61%|ββββββ | 190/313 [01:52<01:03, 1.95it/s]
Training 1/1 epoch (loss 1.5304): 61%|ββββββ | 191/313 [01:52<01:01, 1.98it/s]
Training 1/1 epoch (loss 1.5957): 61%|ββββββ | 191/313 [01:52<01:01, 1.98it/s]
Training 1/1 epoch (loss 1.5957): 61%|βββββββ | 192/313 [01:52<01:02, 1.93it/s]
Training 1/1 epoch (loss 1.5496): 61%|βββββββ | 192/313 [01:53<01:02, 1.93it/s]
Training 1/1 epoch (loss 1.5496): 62%|βββββββ | 193/313 [01:53<01:02, 1.93it/s]
Training 1/1 epoch (loss 1.4442): 62%|βββββββ | 193/313 [01:53<01:02, 1.93it/s]
Training 1/1 epoch (loss 1.4442): 62%|βββββββ | 194/313 [01:54<01:00, 1.97it/s]
Training 1/1 epoch (loss 1.5231): 62%|βββββββ | 194/313 [01:54<01:00, 1.97it/s]
Training 1/1 epoch (loss 1.5231): 62%|βββββββ | 195/313 [01:54<01:00, 1.96it/s]
Training 1/1 epoch (loss 1.6017): 62%|βββββββ | 195/313 [01:55<01:00, 1.96it/s]
Training 1/1 epoch (loss 1.6017): 63%|βββββββ | 196/313 [01:55<00:58, 1.98it/s]
Training 1/1 epoch (loss 1.6107): 63%|βββββββ | 196/313 [01:55<00:58, 1.98it/s]
Training 1/1 epoch (loss 1.6107): 63%|βββββββ | 197/313 [01:55<00:58, 1.97it/s]
Training 1/1 epoch (loss 1.5406): 63%|βββββββ | 197/313 [01:56<00:58, 1.97it/s]
Training 1/1 epoch (loss 1.5406): 63%|βββββββ | 198/313 [01:56<01:00, 1.91it/s]
Training 1/1 epoch (loss 1.5351): 63%|βββββββ | 198/313 [01:56<01:00, 1.91it/s]
Training 1/1 epoch (loss 1.5351): 64%|βββββββ | 199/313 [01:56<00:58, 1.95it/s]
Training 1/1 epoch (loss 1.6008): 64%|βββββββ | 199/313 [01:57<00:58, 1.95it/s]
Training 1/1 epoch (loss 1.6008): 64%|βββββββ | 200/313 [01:57<00:58, 1.94it/s]
Training 1/1 epoch (loss 1.4579): 64%|βββββββ | 200/313 [01:57<00:58, 1.94it/s]
Training 1/1 epoch (loss 1.4579): 64%|βββββββ | 201/313 [01:57<00:58, 1.91it/s]
Training 1/1 epoch (loss 1.6351): 64%|βββββββ | 201/313 [01:58<00:58, 1.91it/s]
Training 1/1 epoch (loss 1.6351): 65%|βββββββ | 202/313 [01:58<00:57, 1.94it/s]
Training 1/1 epoch (loss 1.5972): 65%|βββββββ | 202/313 [01:58<00:57, 1.94it/s]
Training 1/1 epoch (loss 1.5972): 65%|βββββββ | 203/313 [01:58<00:56, 1.95it/s]
Training 1/1 epoch (loss 1.5667): 65%|βββββββ | 203/313 [01:59<00:56, 1.95it/s]
Training 1/1 epoch (loss 1.5667): 65%|βββββββ | 204/313 [01:59<00:55, 1.97it/s]
Training 1/1 epoch (loss 1.5388): 65%|βββββββ | 204/313 [01:59<00:55, 1.97it/s]
Training 1/1 epoch (loss 1.5388): 65%|βββββββ | 205/313 [01:59<00:54, 1.97it/s]
Training 1/1 epoch (loss 1.5047): 65%|βββββββ | 205/313 [02:00<00:54, 1.97it/s]
Training 1/1 epoch (loss 1.5047): 66%|βββββββ | 206/313 [02:00<00:53, 1.99it/s]
Training 1/1 epoch (loss 1.5249): 66%|βββββββ | 206/313 [02:00<00:53, 1.99it/s]
Training 1/1 epoch (loss 1.5249): 66%|βββββββ | 207/313 [02:00<00:53, 1.99it/s]
Training 1/1 epoch (loss 1.5106): 66%|βββββββ | 207/313 [02:01<00:53, 1.99it/s]
Training 1/1 epoch (loss 1.5106): 66%|βββββββ | 208/313 [02:01<00:54, 1.92it/s]
Training 1/1 epoch (loss 1.3844): 66%|βββββββ | 208/313 [02:01<00:54, 1.92it/s]
Training 1/1 epoch (loss 1.3844): 67%|βββββββ | 209/313 [02:01<00:54, 1.90it/s]
Training 1/1 epoch (loss 1.6928): 67%|βββββββ | 209/313 [02:02<00:54, 1.90it/s]
Training 1/1 epoch (loss 1.6928): 67%|βββββββ | 210/313 [02:02<00:54, 1.87it/s]
Training 1/1 epoch (loss 1.5265): 67%|βββββββ | 210/313 [02:02<00:54, 1.87it/s]
Training 1/1 epoch (loss 1.5265): 67%|βββββββ | 211/313 [02:02<00:54, 1.88it/s]
Training 1/1 epoch (loss 1.4486): 67%|βββββββ | 211/313 [02:03<00:54, 1.88it/s]
Training 1/1 epoch (loss 1.4486): 68%|βββββββ | 212/313 [02:03<00:52, 1.92it/s]
Training 1/1 epoch (loss 1.4562): 68%|βββββββ | 212/313 [02:03<00:52, 1.92it/s]
Training 1/1 epoch (loss 1.4562): 68%|βββββββ | 213/313 [02:03<00:51, 1.93it/s]
Training 1/1 epoch (loss 1.6171): 68%|βββββββ | 213/313 [02:04<00:51, 1.93it/s]
Training 1/1 epoch (loss 1.6171): 68%|βββββββ | 214/313 [02:04<00:50, 1.95it/s]
Training 1/1 epoch (loss 1.5040): 68%|βββββββ | 214/313 [02:04<00:50, 1.95it/s]
Training 1/1 epoch (loss 1.5040): 69%|βββββββ | 215/313 [02:04<00:49, 1.99it/s]
Training 1/1 epoch (loss 1.4248): 69%|βββββββ | 215/313 [02:05<00:49, 1.99it/s]
Training 1/1 epoch (loss 1.4248): 69%|βββββββ | 216/313 [02:05<00:50, 1.94it/s]
Training 1/1 epoch (loss 1.6806): 69%|βββββββ | 216/313 [02:05<00:50, 1.94it/s]
Training 1/1 epoch (loss 1.6806): 69%|βββββββ | 217/313 [02:05<00:50, 1.91it/s]
Training 1/1 epoch (loss 1.6686): 69%|βββββββ | 217/313 [02:06<00:50, 1.91it/s]
Training 1/1 epoch (loss 1.6686): 70%|βββββββ | 218/313 [02:06<00:49, 1.93it/s]
Training 1/1 epoch (loss 1.3250): 70%|βββββββ | 218/313 [02:06<00:49, 1.93it/s]
Training 1/1 epoch (loss 1.3250): 70%|βββββββ | 219/313 [02:06<00:48, 1.95it/s]
Training 1/1 epoch (loss 1.5984): 70%|βββββββ | 219/313 [02:07<00:48, 1.95it/s]
Training 1/1 epoch (loss 1.5984): 70%|βββββββ | 220/313 [02:07<00:47, 1.97it/s]
Training 1/1 epoch (loss 1.4931): 70%|βββββββ | 220/313 [02:07<00:47, 1.97it/s]
Training 1/1 epoch (loss 1.4931): 71%|βββββββ | 221/313 [02:07<00:46, 1.97it/s]
Training 1/1 epoch (loss 1.5225): 71%|βββββββ | 221/313 [02:08<00:46, 1.97it/s]
Training 1/1 epoch (loss 1.5225): 71%|βββββββ | 222/313 [02:08<00:45, 1.99it/s]
Training 1/1 epoch (loss 1.4674): 71%|βββββββ | 222/313 [02:08<00:45, 1.99it/s]
Training 1/1 epoch (loss 1.4674): 71%|βββββββ | 223/313 [02:08<00:44, 2.02it/s]
Training 1/1 epoch (loss 1.4249): 71%|βββββββ | 223/313 [02:09<00:44, 2.02it/s]
Training 1/1 epoch (loss 1.4249): 72%|ββββββββ | 224/313 [02:09<00:49, 1.79it/s]
Training 1/1 epoch (loss 1.6648): 72%|ββββββββ | 224/313 [02:10<00:49, 1.79it/s]
Training 1/1 epoch (loss 1.6648): 72%|ββββββββ | 225/313 [02:10<00:47, 1.84it/s]
Training 1/1 epoch (loss 1.4604): 72%|ββββββββ | 225/313 [02:10<00:47, 1.84it/s]
Training 1/1 epoch (loss 1.4604): 72%|ββββββββ | 226/313 [02:10<00:47, 1.83it/s]
Training 1/1 epoch (loss 1.4424): 72%|ββββββββ | 226/313 [02:11<00:47, 1.83it/s]
Training 1/1 epoch (loss 1.4424): 73%|ββββββββ | 227/313 [02:11<00:46, 1.86it/s]
Training 1/1 epoch (loss 1.4093): 73%|ββββββββ | 227/313 [02:11<00:46, 1.86it/s]
Training 1/1 epoch (loss 1.4093): 73%|ββββββββ | 228/313 [02:11<00:44, 1.93it/s]
Training 1/1 epoch (loss 1.5016): 73%|ββββββββ | 228/313 [02:12<00:44, 1.93it/s]
Training 1/1 epoch (loss 1.5016): 73%|ββββββββ | 229/313 [02:12<00:43, 1.93it/s]
Training 1/1 epoch (loss 1.5105): 73%|ββββββββ | 229/313 [02:12<00:43, 1.93it/s]
Training 1/1 epoch (loss 1.5105): 73%|ββββββββ | 230/313 [02:12<00:43, 1.93it/s]
Training 1/1 epoch (loss 1.5392): 73%|ββββββββ | 230/313 [02:13<00:43, 1.93it/s]
Training 1/1 epoch (loss 1.5392): 74%|ββββββββ | 231/313 [02:13<00:41, 1.96it/s]
Training 1/1 epoch (loss 1.6193): 74%|ββββββββ | 231/313 [02:13<00:41, 1.96it/s]
Training 1/1 epoch (loss 1.6193): 74%|ββββββββ | 232/313 [02:13<00:42, 1.92it/s]
Training 1/1 epoch (loss 1.5864): 74%|ββββββββ | 232/313 [02:14<00:42, 1.92it/s]
Training 1/1 epoch (loss 1.5864): 74%|ββββββββ | 233/313 [02:14<00:41, 1.94it/s]
Training 1/1 epoch (loss 1.5523): 74%|ββββββββ | 233/313 [02:14<00:41, 1.94it/s]
Training 1/1 epoch (loss 1.5523): 75%|ββββββββ | 234/313 [02:14<00:40, 1.94it/s]
Training 1/1 epoch (loss 1.6170): 75%|ββββββββ | 234/313 [02:15<00:40, 1.94it/s]
Training 1/1 epoch (loss 1.6170): 75%|ββββββββ | 235/313 [02:15<00:39, 1.97it/s]
Training 1/1 epoch (loss 1.4750): 75%|ββββββββ | 235/313 [02:15<00:39, 1.97it/s]
Training 1/1 epoch (loss 1.4750): 75%|ββββββββ | 236/313 [02:15<00:38, 1.99it/s]
Training 1/1 epoch (loss 1.5296): 75%|ββββββββ | 236/313 [02:16<00:38, 1.99it/s]
Training 1/1 epoch (loss 1.5296): 76%|ββββββββ | 237/313 [02:16<00:38, 1.99it/s]
Training 1/1 epoch (loss 1.4907): 76%|ββββββββ | 237/313 [02:16<00:38, 1.99it/s]
Training 1/1 epoch (loss 1.4907): 76%|ββββββββ | 238/313 [02:16<00:37, 2.02it/s]
Training 1/1 epoch (loss 1.5362): 76%|ββββββββ | 238/313 [02:17<00:37, 2.02it/s]
Training 1/1 epoch (loss 1.5362): 76%|ββββββββ | 239/313 [02:17<00:36, 2.01it/s]
Training 1/1 epoch (loss 1.5228): 76%|ββββββββ | 239/313 [02:17<00:36, 2.01it/s]
Training 1/1 epoch (loss 1.5228): 77%|ββββββββ | 240/313 [02:17<00:37, 1.95it/s]
Training 1/1 epoch (loss 1.4246): 77%|ββββββββ | 240/313 [02:18<00:37, 1.95it/s]
Training 1/1 epoch (loss 1.4246): 77%|ββββββββ | 241/313 [02:18<00:36, 1.96it/s]
Training 1/1 epoch (loss 1.5146): 77%|ββββββββ | 241/313 [02:18<00:36, 1.96it/s]
Training 1/1 epoch (loss 1.5146): 77%|ββββββββ | 242/313 [02:18<00:35, 1.99it/s]
Training 1/1 epoch (loss 1.6600): 77%|ββββββββ | 242/313 [02:19<00:35, 1.99it/s]
Training 1/1 epoch (loss 1.6600): 78%|ββββββββ | 243/313 [02:19<00:34, 2.00it/s]
Training 1/1 epoch (loss 1.5288): 78%|ββββββββ | 243/313 [02:19<00:34, 2.00it/s]
Training 1/1 epoch (loss 1.5288): 78%|ββββββββ | 244/313 [02:19<00:34, 1.99it/s]
Training 1/1 epoch (loss 1.5913): 78%|ββββββββ | 244/313 [02:20<00:34, 1.99it/s]
Training 1/1 epoch (loss 1.5913): 78%|ββββββββ | 245/313 [02:20<00:34, 1.99it/s]
Training 1/1 epoch (loss 1.5486): 78%|ββββββββ | 245/313 [02:20<00:34, 1.99it/s]
Training 1/1 epoch (loss 1.5486): 79%|ββββββββ | 246/313 [02:20<00:33, 2.00it/s]
Training 1/1 epoch (loss 1.5925): 79%|ββββββββ | 246/313 [02:21<00:33, 2.00it/s]
Training 1/1 epoch (loss 1.5925): 79%|ββββββββ | 247/313 [02:21<00:33, 1.98it/s]
Training 1/1 epoch (loss 1.4015): 79%|ββββββββ | 247/313 [02:21<00:33, 1.98it/s]
Training 1/1 epoch (loss 1.4015): 79%|ββββββββ | 248/313 [02:21<00:33, 1.95it/s]
Training 1/1 epoch (loss 1.5104): 79%|ββββββββ | 248/313 [02:22<00:33, 1.95it/s]
Training 1/1 epoch (loss 1.5104): 80%|ββββββββ | 249/313 [02:22<00:32, 1.95it/s]
Training 1/1 epoch (loss 1.5787): 80%|ββββββββ | 249/313 [02:22<00:32, 1.95it/s]
Training 1/1 epoch (loss 1.5787): 80%|ββββββββ | 250/313 [02:22<00:31, 1.98it/s]
Training 1/1 epoch (loss 1.6359): 80%|ββββββββ | 250/313 [02:23<00:31, 1.98it/s]
Training 1/1 epoch (loss 1.6359): 80%|ββββββββ | 251/313 [02:23<00:31, 1.98it/s]
Training 1/1 epoch (loss 1.5411): 80%|ββββββββ | 251/313 [02:23<00:31, 1.98it/s]
Training 1/1 epoch (loss 1.5411): 81%|ββββββββ | 252/313 [02:23<00:30, 1.99it/s]
Training 1/1 epoch (loss 1.5336): 81%|ββββββββ | 252/313 [02:24<00:30, 1.99it/s]
Training 1/1 epoch (loss 1.5336): 81%|ββββββββ | 253/313 [02:24<00:29, 2.00it/s]
Training 1/1 epoch (loss 1.5110): 81%|ββββββββ | 253/313 [02:24<00:29, 2.00it/s]
Training 1/1 epoch (loss 1.5110): 81%|ββββββββ | 254/313 [02:24<00:29, 2.01it/s]
Training 1/1 epoch (loss 1.5857): 81%|ββββββββ | 254/313 [02:25<00:29, 2.01it/s]
Training 1/1 epoch (loss 1.5857): 81%|βββββββββ | 255/313 [02:25<00:29, 1.99it/s]
Training 1/1 epoch (loss 1.5864): 81%|βββββββββ | 255/313 [02:25<00:29, 1.99it/s]
Training 1/1 epoch (loss 1.5864): 82%|βββββββββ | 256/313 [02:25<00:29, 1.94it/s]
Training 1/1 epoch (loss 1.6426): 82%|βββββββββ | 256/313 [02:26<00:29, 1.94it/s]
Training 1/1 epoch (loss 1.6426): 82%|βββββββββ | 257/313 [02:26<00:29, 1.91it/s]
Training 1/1 epoch (loss 1.5753): 82%|βββββββββ | 257/313 [02:26<00:29, 1.91it/s]
Training 1/1 epoch (loss 1.5753): 82%|βββββββββ | 258/313 [02:26<00:28, 1.94it/s]
Training 1/1 epoch (loss 1.4491): 82%|βββββββββ | 258/313 [02:27<00:28, 1.94it/s]
Training 1/1 epoch (loss 1.4491): 83%|βββββββββ | 259/313 [02:27<00:27, 1.93it/s]
Training 1/1 epoch (loss 1.5715): 83%|βββββββββ | 259/313 [02:27<00:27, 1.93it/s]
Training 1/1 epoch (loss 1.5715): 83%|βββββββββ | 260/313 [02:27<00:27, 1.95it/s]
Training 1/1 epoch (loss 1.5549): 83%|βββββββββ | 260/313 [02:28<00:27, 1.95it/s]
Training 1/1 epoch (loss 1.5549): 83%|βββββββββ | 261/313 [02:28<00:26, 1.97it/s]
Training 1/1 epoch (loss 1.4952): 83%|βββββββββ | 261/313 [02:28<00:26, 1.97it/s]
Training 1/1 epoch (loss 1.4952): 84%|βββββββββ | 262/313 [02:28<00:25, 2.00it/s]
Training 1/1 epoch (loss 1.5054): 84%|βββββββββ | 262/313 [02:29<00:25, 2.00it/s]
Training 1/1 epoch (loss 1.5054): 84%|βββββββββ | 263/313 [02:29<00:25, 1.98it/s]
Training 1/1 epoch (loss 1.5038): 84%|βββββββββ | 263/313 [02:29<00:25, 1.98it/s]
Training 1/1 epoch (loss 1.5038): 84%|βββββββββ | 264/313 [02:29<00:25, 1.95it/s]
Training 1/1 epoch (loss 1.3641): 84%|βββββββββ | 264/313 [02:30<00:25, 1.95it/s]
Training 1/1 epoch (loss 1.3641): 85%|βββββββββ | 265/313 [02:30<00:24, 1.96it/s]
Training 1/1 epoch (loss 1.5528): 85%|βββββββββ | 265/313 [02:30<00:24, 1.96it/s]
Training 1/1 epoch (loss 1.5528): 85%|βββββββββ | 266/313 [02:30<00:23, 2.00it/s]
Training 1/1 epoch (loss 1.5745): 85%|βββββββββ | 266/313 [02:31<00:23, 2.00it/s]
Training 1/1 epoch (loss 1.5745): 85%|βββββββββ | 267/313 [02:31<00:23, 2.00it/s]
Training 1/1 epoch (loss 1.5432): 85%|βββββββββ | 267/313 [02:31<00:23, 2.00it/s]
Training 1/1 epoch (loss 1.5432): 86%|βββββββββ | 268/313 [02:31<00:22, 2.01it/s]
Training 1/1 epoch (loss 1.4954): 86%|βββββββββ | 268/313 [02:32<00:22, 2.01it/s]
Training 1/1 epoch (loss 1.4954): 86%|βββββββββ | 269/313 [02:32<00:22, 1.93it/s]
Training 1/1 epoch (loss 1.5393): 86%|βββββββββ | 269/313 [02:32<00:22, 1.93it/s]
Training 1/1 epoch (loss 1.5393): 86%|βββββββββ | 270/313 [02:32<00:21, 1.95it/s]
Training 1/1 epoch (loss 1.5616): 86%|βββββββββ | 270/313 [02:33<00:21, 1.95it/s]
Training 1/1 epoch (loss 1.5616): 87%|βββββββββ | 271/313 [02:33<00:21, 1.92it/s]
Training 1/1 epoch (loss 1.3451): 87%|βββββββββ | 271/313 [02:34<00:21, 1.92it/s]
Training 1/1 epoch (loss 1.3451): 87%|βββββββββ | 272/313 [02:34<00:22, 1.80it/s]
Training 1/1 epoch (loss 1.5292): 87%|βββββββββ | 272/313 [02:34<00:22, 1.80it/s]
Training 1/1 epoch (loss 1.5292): 87%|βββββββββ | 273/313 [02:34<00:22, 1.78it/s]
Training 1/1 epoch (loss 1.4748): 87%|βββββββββ | 273/313 [02:35<00:22, 1.78it/s]
Training 1/1 epoch (loss 1.4748): 88%|βββββββββ | 274/313 [02:35<00:21, 1.85it/s]
Training 1/1 epoch (loss 1.5078): 88%|βββββββββ | 274/313 [02:35<00:21, 1.85it/s]
Training 1/1 epoch (loss 1.5078): 88%|βββββββββ | 275/313 [02:35<00:20, 1.88it/s]
Training 1/1 epoch (loss 1.6202): 88%|βββββββββ | 275/313 [02:36<00:20, 1.88it/s]
Training 1/1 epoch (loss 1.6202): 88%|βββββββββ | 276/313 [02:36<00:19, 1.90it/s]
Training 1/1 epoch (loss 1.5921): 88%|βββββββββ | 276/313 [02:36<00:19, 1.90it/s]
Training 1/1 epoch (loss 1.5921): 88%|βββββββββ | 277/313 [02:36<00:18, 1.90it/s]
Training 1/1 epoch (loss 1.5037): 88%|βββββββββ | 277/313 [02:37<00:18, 1.90it/s]
Training 1/1 epoch (loss 1.5037): 89%|βββββββββ | 278/313 [02:37<00:18, 1.89it/s]
Training 1/1 epoch (loss 1.5524): 89%|βββββββββ | 278/313 [02:37<00:18, 1.89it/s]
Training 1/1 epoch (loss 1.5524): 89%|βββββββββ | 279/313 [02:37<00:17, 1.91it/s]
Training 1/1 epoch (loss 1.5660): 89%|βββββββββ | 279/313 [02:38<00:17, 1.91it/s]
Training 1/1 epoch (loss 1.5660): 89%|βββββββββ | 280/313 [02:38<00:17, 1.89it/s]
Training 1/1 epoch (loss 1.6003): 89%|βββββββββ | 280/313 [02:38<00:17, 1.89it/s]
Training 1/1 epoch (loss 1.6003): 90%|βββββββββ | 281/313 [02:38<00:16, 1.90it/s]
Training 1/1 epoch (loss 1.5924): 90%|βββββββββ | 281/313 [02:39<00:16, 1.90it/s]
Training 1/1 epoch (loss 1.5924): 90%|βββββββββ | 282/313 [02:39<00:16, 1.92it/s]
Training 1/1 epoch (loss 1.4905): 90%|βββββββββ | 282/313 [02:39<00:16, 1.92it/s]
Training 1/1 epoch (loss 1.4905): 90%|βββββββββ | 283/313 [02:39<00:16, 1.86it/s]
Training 1/1 epoch (loss 1.5921): 90%|βββββββββ | 283/313 [02:40<00:16, 1.86it/s]
Training 1/1 epoch (loss 1.5921): 91%|βββββββββ | 284/313 [02:40<00:15, 1.92it/s]
Training 1/1 epoch (loss 1.4911): 91%|βββββββββ | 284/313 [02:40<00:15, 1.92it/s]
Training 1/1 epoch (loss 1.4911): 91%|βββββββββ | 285/313 [02:40<00:14, 1.90it/s]
Training 1/1 epoch (loss 1.5442): 91%|βββββββββ | 285/313 [02:41<00:14, 1.90it/s]
Training 1/1 epoch (loss 1.5442): 91%|ββββββββββ| 286/313 [02:41<00:13, 1.95it/s]
Training 1/1 epoch (loss 1.7276): 91%|ββββββββββ| 286/313 [02:41<00:13, 1.95it/s]
Training 1/1 epoch (loss 1.7276): 92%|ββββββββββ| 287/313 [02:41<00:13, 1.97it/s]
Training 1/1 epoch (loss 1.6731): 92%|ββββββββββ| 287/313 [02:42<00:13, 1.97it/s]
Training 1/1 epoch (loss 1.6731): 92%|ββββββββββ| 288/313 [02:42<00:12, 1.94it/s]
Training 1/1 epoch (loss 1.5480): 92%|ββββββββββ| 288/313 [02:42<00:12, 1.94it/s]
Training 1/1 epoch (loss 1.5480): 92%|ββββββββββ| 289/313 [02:42<00:12, 1.94it/s]
Training 1/1 epoch (loss 1.5241): 92%|ββββββββββ| 289/313 [02:43<00:12, 1.94it/s]
Training 1/1 epoch (loss 1.5241): 93%|ββββββββββ| 290/313 [02:43<00:11, 1.97it/s]
Training 1/1 epoch (loss 1.4544): 93%|ββββββββββ| 290/313 [02:43<00:11, 1.97it/s]
Training 1/1 epoch (loss 1.4544): 93%|ββββββββββ| 291/313 [02:43<00:11, 1.98it/s]
Training 1/1 epoch (loss 1.5814): 93%|ββββββββββ| 291/313 [02:44<00:11, 1.98it/s]
Training 1/1 epoch (loss 1.5814): 93%|ββββββββββ| 292/313 [02:44<00:10, 2.00it/s]
Training 1/1 epoch (loss 1.4412): 93%|ββββββββββ| 292/313 [02:44<00:10, 2.00it/s]
Training 1/1 epoch (loss 1.4412): 94%|ββββββββββ| 293/313 [02:44<00:10, 2.00it/s]
Training 1/1 epoch (loss 1.5699): 94%|ββββββββββ| 293/313 [02:45<00:10, 2.00it/s]
Training 1/1 epoch (loss 1.5699): 94%|ββββββββββ| 294/313 [02:45<00:09, 1.97it/s]
Training 1/1 epoch (loss 1.4861): 94%|ββββββββββ| 294/313 [02:45<00:09, 1.97it/s]
Training 1/1 epoch (loss 1.4861): 94%|ββββββββββ| 295/313 [02:45<00:09, 1.98it/s]
Training 1/1 epoch (loss 1.5030): 94%|ββββββββββ| 295/313 [02:46<00:09, 1.98it/s]
Training 1/1 epoch (loss 1.5030): 95%|ββββββββββ| 296/313 [02:46<00:08, 1.95it/s]
Training 1/1 epoch (loss 1.4301): 95%|ββββββββββ| 296/313 [02:47<00:08, 1.95it/s]
Training 1/1 epoch (loss 1.4301): 95%|ββββββββββ| 297/313 [02:47<00:08, 1.96it/s]
Training 1/1 epoch (loss 1.6348): 95%|ββββββββββ| 297/313 [02:47<00:08, 1.96it/s]
Training 1/1 epoch (loss 1.6348): 95%|ββββββββββ| 298/313 [02:47<00:07, 1.92it/s]
Training 1/1 epoch (loss 1.4022): 95%|ββββββββββ| 298/313 [02:48<00:07, 1.92it/s]
Training 1/1 epoch (loss 1.4022): 96%|ββββββββββ| 299/313 [02:48<00:07, 1.94it/s]
Training 1/1 epoch (loss 1.4450): 96%|ββββββββββ| 299/313 [02:48<00:07, 1.94it/s]
Training 1/1 epoch (loss 1.4450): 96%|ββββββββββ| 300/313 [02:48<00:06, 1.94it/s]
Training 1/1 epoch (loss 1.3780): 96%|ββββββββββ| 300/313 [02:49<00:06, 1.94it/s]
Training 1/1 epoch (loss 1.3780): 96%|ββββββββββ| 301/313 [02:49<00:06, 1.88it/s]
Training 1/1 epoch (loss 1.5478): 96%|ββββββββββ| 301/313 [02:49<00:06, 1.88it/s]
Training 1/1 epoch (loss 1.5478): 96%|ββββββββββ| 302/313 [02:49<00:05, 1.89it/s]
Training 1/1 epoch (loss 1.5449): 96%|ββββββββββ| 302/313 [02:50<00:05, 1.89it/s]
Training 1/1 epoch (loss 1.5449): 97%|ββββββββββ| 303/313 [02:50<00:05, 1.91it/s]
Training 1/1 epoch (loss 1.5525): 97%|ββββββββββ| 303/313 [02:50<00:05, 1.91it/s]
Training 1/1 epoch (loss 1.5525): 97%|ββββββββββ| 304/313 [02:50<00:04, 1.88it/s]
Training 1/1 epoch (loss 1.5321): 97%|ββββββββββ| 304/313 [02:51<00:04, 1.88it/s]
Training 1/1 epoch (loss 1.5321): 97%|ββββββββββ| 305/313 [02:51<00:04, 1.94it/s]
Training 1/1 epoch (loss 1.3883): 97%|ββββββββββ| 305/313 [02:51<00:04, 1.94it/s]
Training 1/1 epoch (loss 1.3883): 98%|ββββββββββ| 306/313 [02:51<00:03, 1.95it/s]
Training 1/1 epoch (loss 1.5585): 98%|ββββββββββ| 306/313 [02:52<00:03, 1.95it/s]
Training 1/1 epoch (loss 1.5585): 98%|ββββββββββ| 307/313 [02:52<00:03, 1.98it/s]
Training 1/1 epoch (loss 1.5036): 98%|ββββββββββ| 307/313 [02:52<00:03, 1.98it/s]
Training 1/1 epoch (loss 1.5036): 98%|ββββββββββ| 308/313 [02:52<00:02, 1.98it/s]
Training 1/1 epoch (loss 1.5408): 98%|ββββββββββ| 308/313 [02:53<00:02, 1.98it/s]
Training 1/1 epoch (loss 1.5408): 99%|ββββββββββ| 309/313 [02:53<00:01, 2.00it/s]
Training 1/1 epoch (loss 1.4019): 99%|ββββββββββ| 309/313 [02:53<00:01, 2.00it/s]
Training 1/1 epoch (loss 1.4019): 99%|ββββββββββ| 310/313 [02:53<00:01, 1.95it/s]
Training 1/1 epoch (loss 1.4451): 99%|ββββββββββ| 310/313 [02:54<00:01, 1.95it/s]
Training 1/1 epoch (loss 1.4451): 99%|ββββββββββ| 311/313 [02:54<00:01, 1.96it/s]
Training 1/1 epoch (loss 1.5089): 99%|ββββββββββ| 311/313 [02:54<00:01, 1.96it/s]
Training 1/1 epoch (loss 1.5089): 100%|ββββββββββ| 312/313 [02:54<00:00, 1.93it/s]
Training 1/1 epoch (loss 1.3117): 100%|ββββββββββ| 312/313 [02:55<00:00, 1.93it/s]
Training 1/1 epoch (loss 1.3117): 100%|ββββββββββ| 313/313 [02:55<00:00, 1.90it/s]
Training 1/1 epoch (loss 1.3117): 100%|ββββββββββ| 313/313 [02:55<00:00, 1.79it/s] |
|
tokenizer config file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/tokenizer_config.json |
|
Special tokens file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/special_tokens_map.json |
|
wandb: ERROR Problem finishing run |
|
Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550a56d6650>> |
|
Traceback (most recent call last): |
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
|
return func(*args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^ |
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
|
self.wandb.finish() |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 449, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 391, in wrapper |
|
return func(self, *args, **kwargs) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2106, in finish |
|
return self._finish(exit_code) |
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2127, in _finish |
|
self._atexit_cleanup(exit_code=exit_code) |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup |
|
self._on_finish() |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish |
|
wait_with_progress( |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
|
return wait_all_with_progress( |
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
|
return asyncio_compat.run(progress_loop_with_timeout) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
|
future = executor.submit(runner.run, fn) |
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
|
raise RuntimeError('cannot schedule new futures after ' |
|
RuntimeError: cannot schedule new futures after interpreter shutdown |
|
|