[rank0]: Traceback (most recent call last):
[rank0]:   File "/workspace/kohya_ss/sd-scripts/flux_train.py", line 918, in <module>
[rank0]:     train(args)
[rank0]:   File "/workspace/kohya_ss/sd-scripts/flux_train.py", line 432, in train
[rank0]:     flux = accelerator.prepare(flux, device_placement=[not is_swapping_blocks])
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1311, in prepare
[rank0]:     result = tuple(
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1312, in <genexpr>
[rank0]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1188, in _prepare_one
[rank0]:     return self.prepare_model(obj, device_placement=device_placement)
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1452, in prepare_model
[rank0]:     model = torch.nn.parallel.DistributedDataParallel(
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 739, in __init__
[rank0]:     self._log_and_throw(
[rank0]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1127, in _log_and_throw
[rank0]:     raise err_type(err_msg)
[rank0]: ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [0], output_device 0, and module parameters {device(type='cpu')}.
[rank1]: Traceback (most recent call last):
[rank1]:   File "/workspace/kohya_ss/sd-scripts/flux_train.py", line 918, in <module>
[rank1]:     train(args)
[rank1]:   File "/workspace/kohya_ss/sd-scripts/flux_train.py", line 432, in train
[rank1]:     flux = accelerator.prepare(flux, device_placement=[not is_swapping_blocks])
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1311, in prepare
[rank1]:     result = tuple(
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1312, in <genexpr>
[rank1]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1188, in _prepare_one
[rank1]:     return self.prepare_model(obj, device_placement=device_placement)
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 1452, in prepare_model
[rank1]:     model = torch.nn.parallel.DistributedDataParallel(
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 739, in __init__
[rank1]:     self._log_and_throw(
[rank1]:   File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1127, in _log_and_throw
[rank1]:     raise err_type(err_msg)
[rank1]: ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [1], output_device 1, and module parameters {device(type='cpu')}.
W0920 12:50:23.568000 140317697544192 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1369 closing signal SIGTERM
E0920 12:50:23.682000 140317697544192 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 1368) of binary: /workspace/kohya_ss/venv/bin/python
Traceback (most recent call last):
  File "/workspace/kohya_ss/venv/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1097, in launch_command
    multi_gpu_launcher(args)
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 734, in multi_gpu_launcher
    distrib_run.run(args)
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/workspace/kohya_ss/venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/workspace/kohya_ss/sd-scripts/flux_train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-09-20_12:50:23
  host      : 5e87f86b425f
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1368)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
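For context on the ValueError itself, here is a minimal standalone sketch of the DDP constraint the traceback is hitting (this is illustrative, not the flux_train.py code; it assumes a single-process group initialized via MASTER_ADDR/MASTER_PORT and one visible CUDA device). Wrapping a module in DistributedDataParallel with device_ids while its parameters are still on the CPU raises exactly this error; moving the module onto the target GPU before wrapping does not.

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumptions: single process (rank 0, world_size 1), at least one CUDA device.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

model = torch.nn.Linear(8, 8)  # parameters live on the CPU at this point

# DDP(model, device_ids=[0])   # would raise the same ValueError as above:
#   "device_ids and output_device arguments only work with
#    single-device/multiple-device GPU modules or CPU modules, but got
#    device_ids [0], ... module parameters {device(type='cpu')}."

model.to("cuda:0")                 # move the parameters onto the target GPU first...
ddp = DDP(model, device_ids=[0])   # ...then wrapping with device_ids succeeds

dist.destroy_process_group()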