[rank0]:[E609 00:24:47.186853648 ProcessGroupNCCL.cpp:629] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
[rank0]:[E609 00:24:47.189780854 ProcessGroupNCCL.cpp:2168] [PG ID 0 PG GUID 0(default_pg) Rank 0] failure detected by watchdog at work sequence id: 1 PG status: last enqueued work: 1, last completed work: -1
[rank0]:[E609 00:24:47.190080357 ProcessGroupNCCL.cpp:667] Stack trace of the failed collective not found, potentially because FlightRecorder is disabled. You can enable it by setting TORCH_NCCL_TRACE_BUFFER_SIZE to a non-zero value.
[rank1]:[E609 00:24:48.277501819 ProcessGroupNCCL.cpp:629] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
[rank1]:[E609 00:24:48.279945091 ProcessGroupNCCL.cpp:2168] [PG ID 0 PG GUID 0(default_pg) Rank 1] failure detected by watchdog at work sequence id: 1 PG status: last enqueued work: 1, last completed work: -1
[rank1]:[E609 00:24:48.280072432 ProcessGroupNCCL.cpp:667] Stack trace of the failed collective not found, potentially because FlightRecorder is disabled. You can enable it by setting TORCH_NCCL_TRACE_BUFFER_SIZE to a non-zero value.
[rank0]: Traceback (most recent call last):
[rank0]:   File "/musubi-tuner/fpack_train_network.py", line 617, in <module>
[rank0]:     trainer.train(args)
[rank0]:   File "/musubi-tuner/hv_train_network.py", line 1648, in train
[rank0]:     network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler)
[rank0]:                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1446, in prepare
[rank0]:     result = tuple(
[rank0]:              ^^^^^^
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1447, in <genexpr>
[rank0]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1289, in _prepare_one
[rank0]:     return self.prepare_model(obj, device_placement=device_placement)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1595, in prepare_model
[rank0]:     model = torch.nn.parallel.DistributedDataParallel(
[rank0]:             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 825, in __init__
[rank0]:     _verify_param_shape_across_processes(self.process_group, parameters)
[rank0]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/distributed/utils.py", line 294, in _verify_param_shape_across_processes
[rank0]:     return dist._verify_params_across_processes(process_group, tensors, logger)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: DDP expects same model across all ranks, but Rank 0 has 880 params, while rank 1 has inconsistent 0 params.
[rank1]: Traceback (most recent call last):
[rank1]:   File "/musubi-tuner/fpack_train_network.py", line 617, in <module>
[rank1]:     trainer.train(args)
[rank1]:   File "/musubi-tuner/hv_train_network.py", line 1648, in train
[rank1]:     network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler)
[rank1]:                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1446, in prepare
[rank1]:     result = tuple(
[rank1]:              ^^^^^^
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1447, in <genexpr>
[rank1]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank1]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1289, in _prepare_one
[rank1]:     return self.prepare_model(obj, device_placement=device_placement)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/accelerator.py", line 1595, in prepare_model
[rank1]:     model = torch.nn.parallel.DistributedDataParallel(
[rank1]:             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 825, in __init__
[rank1]:     _verify_param_shape_across_processes(self.process_group, parameters)
[rank1]:   File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/distributed/utils.py", line 294, in _verify_param_shape_across_processes
[rank1]:     return dist._verify_params_across_processes(process_group, tensors, logger)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: RuntimeError: DDP expects same model across all ranks, but Rank 1 has 880 params, while rank 0 has inconsistent 0 params.
[rank0]:[E609 00:24:48.714209617 ProcessGroupNCCL.cpp:681] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank0]:[E609 00:24:48.714323378 ProcessGroupNCCL.cpp:695] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
[rank0]:[E609 00:24:48.722522951 ProcessGroupNCCL.cpp:1895] [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x72f281acc788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x23d (0x72f2307d39ad in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x9e8 (0x72f2307d4fa8 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x72f2307d5c4d in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xd8198 (0x72f284b27198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x72f2856c6ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x72f285757a04 in /lib/x86_64-linux-gnu/libc.so.6)
terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x72f281acc788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x23d (0x72f2307d39ad in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x9e8 (0x72f2307d4fa8 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x72f2307d5c4d in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xd8198 (0x72f284b27198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x72f2856c6ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x72f285757a04 in /lib/x86_64-linux-gnu/libc.so.6)
Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1901 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x72f281acc788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x10d2c3e (0x72f2307a6c3e in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0xd6d5ed (0x72f2304415ed in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xd8198 (0x72f284b27198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #4: <unknown function> + 0x94ac3 (0x72f2856c6ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #5: clone + 0x44 (0x72f285757a04 in /lib/x86_64-linux-gnu/libc.so.6)
[rank1]:[E609 00:24:48.048166525 ProcessGroupNCCL.cpp:681] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank1]:[E609 00:24:48.048284856 ProcessGroupNCCL.cpp:695] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
[rank1]:[E609 00:24:48.056881613 ProcessGroupNCCL.cpp:1895] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x7da22f57d788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x23d (0x7da1de1d39ad in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x9e8 (0x7da1de1d4fa8 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7da1de1d5c4d in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xd8198 (0x7da2325e2198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7da233181ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x7da233212a04 in /lib/x86_64-linux-gnu/libc.so.6)
terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=ALLGATHER, NumelIn=1, NumelOut=2, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x7da22f57d788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x23d (0x7da1de1d39ad in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x9e8 (0x7da1de1d4fa8 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7da1de1d5c4d in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xd8198 (0x7da2325e2198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #5: <unknown function> + 0x94ac3 (0x7da233181ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x7da233212a04 in /lib/x86_64-linux-gnu/libc.so.6)
Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1901 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x7da22f57d788 in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x10d2c3e (0x7da1de1a6c3e in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0xd6d5ed (0x7da1dde415ed in /musubi-tuner/venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xd8198 (0x7da2325e2198 in /opt/conda/bin/../lib/libstdc++.so.6)
frame #4: <unknown function> + 0x94ac3 (0x7da233181ac3 in /lib/x86_64-linux-gnu/libc.so.6)
frame #5: clone + 0x44 (0x7da233212a04 in /lib/x86_64-linux-gnu/libc.so.6)
W0609 00:24:49.814000 3604 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3673 closing signal SIGTERM
E0609 00:24:50.740000 3604 torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 0 (pid: 3672) of binary: /musubi-tuner/venv/bin/python
Traceback (most recent call last):
  File "/musubi-tuner/venv/bin/accelerate", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
    args.func(args)
  File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1204, in launch_command
    multi_gpu_launcher(args)
  File "/musubi-tuner/venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 825, in multi_gpu_launcher
    distrib_run.run(args)
  File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
    elastic_launch(
  File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/musubi-tuner/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=====================================================
fpack_train_network.py FAILED
-----------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-06-09_00:24:49
  host      : 1a962a38c261
  rank      : 0 (local_rank: 0)
  exitcode  : -6 (pid: 3672)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 3672
=====================================================
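Note on the root cause: the actual failure is the RuntimeError from DDP's _verify_param_shape_across_processes, which reports that rank 0 and rank 1 see different parameter counts (880 vs. 0). That usually means the network was built differently, or not at all, on one rank before accelerator.prepare() was called; the NCCL ALLGATHER timeout and SIGABRT are just the fallout. Below is a minimal, hypothetical pre-flight check, not part of musubi-tuner, that surfaces the mismatch immediately instead of after a 10-minute watchdog timeout. The helper name assert_same_param_count and the suggested call site are assumptions; only the `network` object comes from the traceback above.

import torch
import torch.distributed as dist

def assert_same_param_count(module: torch.nn.Module) -> None:
    # Count this rank's parameters and gather every rank's count.
    # NCCL collectives operate on CUDA tensors, so the count lives on the GPU.
    local = torch.tensor([sum(1 for _ in module.parameters())],
                         device=torch.device("cuda", torch.cuda.current_device()))
    gathered = [torch.zeros_like(local) for _ in range(dist.get_world_size())]
    dist.all_gather(gathered, local)
    counts = [int(t.item()) for t in gathered]
    if len(set(counts)) != 1:
        # For the run logged above this would report [880, 0].
        raise RuntimeError(f"Per-rank parameter counts differ: {counts}")

# Hypothetical call site, right before accelerator.prepare(network, ...):
# assert_same_param_count(network)

With a check like this, the rank that ends up with an empty network fails fast with a readable message, pointing at rank-dependent model/LoRA construction rather than at the NCCL watchdog.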
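Separately, both ranks log "Stack trace of the failed collective not found, potentially because FlightRecorder is disabled. You can enable it by setting TORCH_NCCL_TRACE_BUFFER_SIZE to a non-zero value." A minimal sketch of doing that from Python follows; the buffer size shown is an arbitrary assumption, and the variable can just as well be exported in the shell before accelerate launch. It has to be set before the NCCL process group is created.

import os

# Enable PyTorch's NCCL Flight Recorder so a future timeout can report the
# stack trace of the failed collective. Any non-zero value enables it;
# 2000 is an assumed ring-buffer size, not a value taken from the log.
os.environ.setdefault("TORCH_NCCL_TRACE_BUFFER_SIZE", "2000")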