0: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:619259] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` 0: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:619259] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing 3: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:1158470] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` 2: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:696294] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` 1: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:463492] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` 2: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:696294] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing 3: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:1158470] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing 1: [2025-08-09 11:42:17,057] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:463492] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing 0: [2025-08-09 11:42:36,408] [INFO] [axolotl.cli.config.load_cfg:244] [PID:619259] [RANK:0] config: 0: { 0: "activation_offloading": false, 0: "auto_resume_from_checkpoints": true, 0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1754731818992423200.yaml", 0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-7B", 0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-7B", 0: "batch_size": 16, 0: "bf16": true, 0: "capabilities": { 0: "bf16": true, 0: "compute_capability": "sm_90", 0: "fp8": false, 0: "n_gpu": 4, 0: "n_node": 1 0: }, 0: "chat_template": "qwen_25", 0: "dataloader_num_workers": 4, 0: "dataloader_pin_memory": true, 0: "dataloader_prefetch_factor": 256, 0: "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset/Qwen3-235B-A22B/Qwen2.5-7B/1", 0: "dataset_processes": 192, 0: "datasets": [ 0: { 0: "chat_template": "tokenizer_default", 0: "data_files": [ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0007.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0009.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0005.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0006.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0014.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0010.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0012.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0008.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0001.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0002.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0013.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0015.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0004.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0011.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0000.jsonl", 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0003.jsonl" 0: ], 0: "ds_type": "json", 0: "field_messages": "conversations", 0: "message_property_mappings": { 0: "content": "content", 0: "role": "role" 0: }, 0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking", 0: "trust_remote_code": false, 0: "type": "chat_template" 0: } 0: ], 0: "ddp": true, 0: "deepspeed": { 0: "bf16": { 0: "enabled": true 0: }, 0: "gradient_accumulation_steps": "auto", 0: "gradient_clipping": "auto", 0: "train_batch_size": "auto", 0: "train_micro_batch_size_per_gpu": "auto", 0: "wall_clock_breakdown": false, 0: "zero_optimization": { 0: "contiguous_gradients": true, 0: "overlap_comm": true, 0: "reduce_bucket_size": "auto", 0: "stage": 3, 0: "stage3_gather_16bit_weights_on_model_save": true, 0: "stage3_param_persistence_threshold": "auto", 0: "stage3_prefetch_bucket_size": "auto", 0: "sub_group_size": 0 0: } 0: }, 0: "device": "cuda:0", 0: "device_map": { 0: "": 0 0: }, 0: "env_capabilities": { 0: "torch_version": "2.6.0" 0: }, 0: "eval_batch_size": 1, 0: "eval_causal_lm_metrics": [ 0: "sacrebleu", 0: "comet", 0: "ter", 0: "chrf" 0: ], 0: "eval_max_new_tokens": 128, 0: "eval_sample_packing": true, 0: "eval_table_size": 0, 0: "evals_per_epoch": 0, 0: "flash_attention": true, 0: "fp16": false, 0: "gradient_accumulation_steps": 4, 0: "gradient_checkpointing": true, 0: "gradient_checkpointing_kwargs": { 0: "use_reentrant": true 0: }, 0: "learning_rate": 5e-06, 0: "lisa_layers_attribute": "model.layers", 0: "load_best_model_at_end": false, 0: "load_in_4bit": false, 0: "load_in_8bit": false, 0: "local_rank": 0, 0: "logging_steps": 10, 0: "lora_dropout": 0.0, 0: "loraplus_lr_embedding": 1e-06, 0: "lr_scheduler": "warmup_stable_decay", 0: "lr_scheduler_kwargs": { 0: "min_lr_ratio": 0.1, 0: "num_decay_steps": 300 0: }, 0: "max_prompt_len": 512, 0: "mean_resizing_embeddings": false, 0: "micro_batch_size": 1, 0: "model_config_type": "qwen2", 0: "num_epochs": 1.0, 0: "optimizer": "adamw_torch_fused", 0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Qwen3-235B-A22B/Qwen2.5-7B/1", 0: "pad_to_sequence_len": true, 0: "pretrain_multipack_attn": true, 0: "pretrain_multipack_buffer_size": 10000, 0: "profiler_steps_start": 0, 0: "qlora_sharded_model_loading": false, 0: "ray_num_workers": 1, 0: "resources_per_worker": { 0: "GPU": 1 0: }, 0: "sample_packing": true, 0: "sample_packing_bin_size": 200, 0: "sample_packing_group_size": 100000, 0: "save_only_model": false, 0: "save_safetensors": true, 0: "save_steps": 0.2, 0: "save_total_limit": 20, 0: "sequence_len": 16384, 0: "sequence_parallel_degree": 1, 0: "shuffle_merged_datasets": true, 0: "skip_prepare_dataset": false, 0: "special_tokens": { 0: "bos_token": "<|im_start|>", 0: "eos_token": "<|im_end|>", 0: "pad_token": "<|endoftext|>" 0: }, 0: "strict": false, 0: "tensor_parallel_size": 1, 0: "tf32": false, 0: "tiled_mlp_use_original_mlp": true, 0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-7B", 0: "torch_dtype": "torch.bfloat16", 0: "train_on_inputs": false, 0: "trl": { 0: "log_completions": false, 0: "mask_truncated_completions": false, 0: "ref_model_mixup_alpha": 0.9, 0: "ref_model_sync_steps": 64, 0: "scale_rewards": true, 0: "sync_ref_model": false, 0: "use_vllm": false, 0: "vllm_server_host": "0.0.0.0", 0: "vllm_server_port": 8000 0: }, 0: "use_ray": false, 0: "use_tensorboard": true, 0: "val_set_size": 0.0, 0: "vllm": { 0: "device": "auto", 0: "dtype": "auto", 0: "gpu_memory_utilization": 0.9, 0: "host": "0.0.0.0", 0: "port": 8000 0: }, 0: "warmup_steps": 150, 0: "weight_decay": 0.0, 0: "world_size": 4 0: } 0: [2025-08-09 11:42:36,409] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:619259] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. 0: [2025-08-09 11:42:37,286] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:471] [PID:619259] [RANK:0] Loading prepared dataset from disk at /lustre/fsn1/projects/rech/dgo/udv55np/dataset/Qwen3-235B-A22B/Qwen2.5-7B/1/5f27dbd9b7ab95f9ee1213e1d97bc04e... 0: [2025-08-09 11:44:37,617] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:435] [PID:619259] [RANK:0] gather_len_batches: [235607, 235607, 235609, 235608] 0: [2025-08-09 11:44:37,640] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:496] [PID:619259] [RANK:0] sample_packing_eff_est across ranks: [0.9936812520027161, 0.9936348795890808, 0.9936854839324951, 0.9936475157737732] 0: [2025-08-09 11:44:37,660] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:123] [PID:619259] [RANK:0] Maximum number of steps set at 14725 0: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 2: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 1: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 3: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 3: Loading checkpoint shards: 0%| | 0/4 [00:00 + 0x6311918 (0x14c6a14cf918 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #2: + 0x6311f39 (0x14c6a14cff39 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #3: + 0x6314e57 (0x14c6a14d2e57 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #4: c10d::TCPStore::compareSet(std::string const&, std::vector > const&, std::vector > const&) + 0x24c (0x14c6a14cc7bc in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #5: + 0xe1c9d5 (0x14c6b10c89d5 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_python.so) 3: frame #6: + 0x51a017 (0x14c6b07c6017 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_python.so) 3: frame #7: + 0x1ffdc6 (0x5597b002bdc6 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #8: _PyObject_MakeTpCall + 0x25b (0x5597b000c07b in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #9: + 0x22f625 (0x5597b005b625 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #10: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #11: _PyFunction_Vectorcall + 0x181 (0x5597b003b121 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #12: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #13: _PyFunction_Vectorcall + 0x181 (0x5597b003b121 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #14: _PyObject_FastCallDictTstate + 0x63 (0x5597b000f853 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #15: _PyObject_Call_Prepend + 0x69 (0x5597b0042e09 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #16: + 0x2e99b9 (0x5597b01159b9 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #17: PyObject_Call + 0x1ed (0x5597b0044ebd in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #18: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #19: _PyFunction_Vectorcall + 0x181 (0x5597b003b121 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #20: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #21: + 0x2a4d36 (0x5597b00d0d36 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #22: PyEval_EvalCode + 0x9f (0x5597b00d03ef in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #23: + 0x2c2f2a (0x5597b00eef2a in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #24: + 0x2bf343 (0x5597b00eb343 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #25: + 0x2d4300 (0x5597b0100300 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #26: _PyRun_SimpleFileObject + 0x1ae (0x5597b00ffc5e in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #27: _PyRun_AnyFileObject + 0x44 (0x5597b00ffa44 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #28: Py_RunMain + 0x2df (0x5597b00f9bdf in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #29: Py_BytesMain + 0x37 (0x5597b00bef97 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #30: + 0x29590 (0x14c6b9629590 in /lib64/libc.so.6) 3: frame #31: __libc_start_main + 0x80 (0x14c6b9629640 in /lib64/libc.so.6) 3: frame #32: + 0x292e3d (0x5597b00bee3d in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: 3: W0809 20:00:36.953000 1158391 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1284] The node 'jzxh305.hpc.idris.fr_1158391_0' has failed to shutdown the rendezvous '2157844' due to an error of type RendezvousConnectionError. 3: [W809 20:00:36.500066626 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=5, addr=[jzxh305]:49614, remote=[jzxh277]:29400): Broken pipe 3: Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:646 (most recent call first): 3: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x14c665ecb1b6 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libc10.so) 3: frame #1: + 0x6311c1d (0x14c6a14cfc1d in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #2: + 0x63146ac (0x14c6a14d26ac in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #3: + 0x631484d (0x14c6a14d284d in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #4: c10d::TCPStore::compareSet(std::string const&, std::vector > const&, std::vector > const&) + 0x310 (0x14c6a14cc880 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) 3: frame #5: + 0xe1c9d5 (0x14c6b10c89d5 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_python.so) 3: frame #6: + 0x51a017 (0x14c6b07c6017 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/lib/python3.11/site-packages/torch/lib/libtorch_python.so) 3: frame #7: + 0x1ffdc6 (0x5597b002bdc6 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #8: _PyObject_MakeTpCall + 0x25b (0x5597b000c07b in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #9: + 0x22f625 (0x5597b005b625 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #10: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #11: _PyFunction_Vectorcall + 0x181 (0x5597b003b121 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #12: _PyObject_FastCallDictTstate + 0x63 (0x5597b000f853 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #13: _PyObject_Call_Prepend + 0x69 (0x5597b0042e09 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #14: + 0x2e99b9 (0x5597b01159b9 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #15: PyObject_Call + 0x1ed (0x5597b0044ebd in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #16: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #17: _PyFunction_Vectorcall + 0x181 (0x5597b003b121 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #18: _PyEval_EvalFrameDefault + 0x4327 (0x5597b001c4e7 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #19: + 0x2a4d36 (0x5597b00d0d36 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #20: PyEval_EvalCode + 0x9f (0x5597b00d03ef in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #21: + 0x2c2f2a (0x5597b00eef2a in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #22: + 0x2bf343 (0x5597b00eb343 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #23: + 0x2d4300 (0x5597b0100300 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #24: _PyRun_SimpleFileObject + 0x1ae (0x5597b00ffc5e in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #25: _PyRun_AnyFileObject + 0x44 (0x5597b00ffa44 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #26: Py_RunMain + 0x2df (0x5597b00f9bdf in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #27: Py_BytesMain + 0x37 (0x5597b00bef97 in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: frame #28: + 0x29590 (0x14c6b9629590 in /lib64/libc.so.6) 3: frame #29: __libc_start_main + 0x80 (0x14c6b9629640 in /lib64/libc.so.6) 3: frame #30: + 0x292e3d (0x5597b00bee3d in /lustre/fswork/projects/rech/qwv/udv55np/axolotl/bin/python) 3: 3: W0809 20:00:36.958000 1158391 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1284] The node 'jzxh305.hpc.idris.fr_1158391_0' has failed to shutdown the rendezvous '2157844' due to an error of type RendezvousConnectionError.