From d8a65baedb8a7aff1dfc5eba431a28ef5414668c Mon Sep 17 00:00:00 2001 From: ytl0623 Date: Thu, 22 Jan 2026 14:03:09 +0800 Subject: [PATCH 1/6] Support MIG UUID and respect CUDA_VISIBLE_DEVICES in nnUNetV2Runner Signed-off-by: ytl0623 --- monai/apps/nnunet/nnunetv2_runner.py | 30 +++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 8a10849904..2ed0fc6873 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -529,17 +529,26 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int run_cmd(cmd, shell=True) def train_single_model_command(self, config, fold, gpu_id, kwargs): - if isinstance(gpu_id, (tuple, list)): + device_setting = "" + num_gpus = 1 + if isinstance(gpu_id, str): + device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" + num_gpus = 1 + elif isinstance(gpu_id, (tuple, list)): if len(gpu_id) > 1: - gpu_ids_str = "" - for _i in range(len(gpu_id)): - gpu_ids_str += f"{gpu_id[_i]}," - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str[:-1]}" - else: + gpu_ids_str = ",".join(str(x) for x in gpu_id) + device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str}" + num_gpus = len(gpu_id) + elif len(gpu_id) == 1: device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}" + num_gpus = 1 else: device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" - num_gpus = 1 if isinstance(gpu_id, int) or len(gpu_id) == 1 else len(gpu_id) + num_gpus = 1 + env_cuda = os.environ.get("CUDA_VISIBLE_DEVICES") + if env_cuda is not None and device_setting == "CUDA_VISIBLE_DEVICES=0": + logger.info(f"Using existing environment variable CUDA_VISIBLE_DEVICES='{env_cuda}'") + device_setting = "" cmd = ( f"{device_setting} nnUNetv2_train " @@ -779,7 +788,7 @@ def predict( part_id: int = 0, num_processes_preprocessing: int = -1, num_processes_segmentation_export: int = -1, - gpu_id: int = 0, + gpu_id: int | str = 0, ) -> None: """ Use this to run inference with nnU-Net. This function is used when you want to manually specify a folder containing @@ -815,7 +824,10 @@ def predict( More is not always better. Beware of out-of-RAM issues. gpu_id: which GPU to use for prediction. """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" + if "CUDA_VISIBLE_DEVICES" in os.environ and gpu_id == 0: + logger.info(f"Predict: Using existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}") + else: + os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" from nnunetv2.inference.predict_from_raw_data import nnUNetPredictor From 834c57acfe52756fe335e2797909aa692c1a2efd Mon Sep 17 00:00:00 2001 From: ytl0623 Date: Thu, 22 Jan 2026 14:22:09 +0800 Subject: [PATCH 2/6] fix bug: string 0 bypasses env var preservation Signed-off-by: ytl0623 --- monai/apps/nnunet/nnunetv2_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 2ed0fc6873..6e2356f231 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -822,9 +822,11 @@ def predict( num_processes_preprocessing: out-of-RAM issues. num_processes_segmentation_export: Number of processes used for segmentation export. More is not always better. Beware of out-of-RAM issues. - gpu_id: which GPU to use for prediction. + gpu_id: GPU device index (int) or MIG UUID (str) for prediction. + If CUDA_VISIBLE_DEVICES is already set and gpu_id is 0, the existing + environment variable is preserved. """ - if "CUDA_VISIBLE_DEVICES" in os.environ and gpu_id == 0: + if "CUDA_VISIBLE_DEVICES" in os.environ and (gpu_id == 0 or gpu_id == "0"): logger.info(f"Predict: Using existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}") else: os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" From fe71b7b363e17ccc3b502c42e0c4dda7c72cc2d7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Jan 2026 06:22:44 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/apps/nnunet/nnunetv2_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 6e2356f231..51c514a940 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -826,7 +826,7 @@ def predict( If CUDA_VISIBLE_DEVICES is already set and gpu_id is 0, the existing environment variable is preserved. """ - if "CUDA_VISIBLE_DEVICES" in os.environ and (gpu_id == 0 or gpu_id == "0"): + if "CUDA_VISIBLE_DEVICES" in os.environ and (gpu_id == 0 or gpu_id == "0"): logger.info(f"Predict: Using existing CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}") else: os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" From eddddcadcf303006e75c31475f0ea8a0a7454e56 Mon Sep 17 00:00:00 2001 From: ytl0623 Date: Thu, 22 Jan 2026 14:37:12 +0800 Subject: [PATCH 4/6] minor fixes: 1. added docstring and type hints 2. lead whitespace when device_setting is empty 3. empty tuple/list falls through silently Signed-off-by: ytl0623 --- monai/apps/nnunet/nnunetv2_runner.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 51c514a940..c5b930fa99 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -528,13 +528,29 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int cmd = self.train_single_model_command(config, fold, gpu_id, kwargs) run_cmd(cmd, shell=True) - def train_single_model_command(self, config, fold, gpu_id, kwargs): + def train_single_model_command( + self, config: str, fold: int, gpu_id: int | str | tuple | list, kwargs: dict[str, Any] + ) -> str: + """ + Build the shell command string for training a single nnU-Net model. + + Args: + config: Configuration name (e.g., "3d_fullres"). + fold: Cross-validation fold index (0-4). + gpu_id: Device selector—int, str (MIG UUID), or tuple/list for multi-GPU. + kwargs: Additional CLI arguments forwarded to nnUNetv2_train. + + Returns: + Shell command string. + """ device_setting = "" num_gpus = 1 if isinstance(gpu_id, str): device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" num_gpus = 1 elif isinstance(gpu_id, (tuple, list)): + if len(gpu_id) == 0: + raise ValueError("gpu_id tuple/list cannot be empty") if len(gpu_id) > 1: gpu_ids_str = ",".join(str(x) for x in gpu_id) device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str}" @@ -550,8 +566,9 @@ def train_single_model_command(self, config, fold, gpu_id, kwargs): logger.info(f"Using existing environment variable CUDA_VISIBLE_DEVICES='{env_cuda}'") device_setting = "" + prefix = f"{device_setting} " if device_setting else "" cmd = ( - f"{device_setting} nnUNetv2_train " + f"{prefix}nnUNetv2_train " + f"{self.dataset_name_or_id} {config} {fold} " + f"-tr {self.trainer_class_name} -num_gpus {num_gpus}" ) From d053eb4115591403b091d695f9ba866ee6a7a6b6 Mon Sep 17 00:00:00 2001 From: ytl0623 Date: Thu, 22 Jan 2026 15:07:34 +0800 Subject: [PATCH 5/6] add docstring to the Raises section Signed-off-by: ytl0623 --- monai/apps/nnunet/nnunetv2_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index c5b930fa99..93165a9537 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -526,7 +526,7 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.") cmd = self.train_single_model_command(config, fold, gpu_id, kwargs) - run_cmd(cmd, shell=True) + run_cmd(cmd, shell=True) # type: ignore def train_single_model_command( self, config: str, fold: int, gpu_id: int | str | tuple | list, kwargs: dict[str, Any] @@ -542,6 +542,9 @@ def train_single_model_command( Returns: Shell command string. + + Raises: + ValueError: If gpu_id is an empty tuple or list. """ device_setting = "" num_gpus = 1 From d038d2c02d6fccc33f1778c705357c2394af3efa Mon Sep 17 00:00:00 2001 From: ytl0623 Date: Thu, 22 Jan 2026 15:33:40 +0800 Subject: [PATCH 6/6] refactor nnUNet runner for security, MIG support, and type safety Signed-off-by: ytl0623 --- monai/apps/nnunet/nnunetv2_runner.py | 52 ++++++++++++++++------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 93165a9537..ac75d96d94 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -525,12 +525,12 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int kwargs.pop("npz") logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.") - cmd = self.train_single_model_command(config, fold, gpu_id, kwargs) - run_cmd(cmd, shell=True) # type: ignore + cmd, env = self.train_single_model_command(config, fold, gpu_id, kwargs) + run_cmd(cmd, env=env) def train_single_model_command( self, config: str, fold: int, gpu_id: int | str | tuple | list, kwargs: dict[str, Any] - ) -> str: + ) -> tuple[list[str], dict[str, str]]: """ Build the shell command string for training a single nnU-Net model. @@ -546,43 +546,49 @@ def train_single_model_command( Raises: ValueError: If gpu_id is an empty tuple or list. """ - device_setting = "" + env = os.environ.copy() + device_setting: str | None = None num_gpus = 1 if isinstance(gpu_id, str): - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" + device_setting = gpu_id num_gpus = 1 elif isinstance(gpu_id, (tuple, list)): if len(gpu_id) == 0: raise ValueError("gpu_id tuple/list cannot be empty") if len(gpu_id) > 1: - gpu_ids_str = ",".join(str(x) for x in gpu_id) - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str}" + device_setting = ",".join(str(x) for x in gpu_id) num_gpus = len(gpu_id) elif len(gpu_id) == 1: - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}" + device_setting = str(gpu_id[0]) num_gpus = 1 else: - device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" + device_setting = str(gpu_id) num_gpus = 1 - env_cuda = os.environ.get("CUDA_VISIBLE_DEVICES") - if env_cuda is not None and device_setting == "CUDA_VISIBLE_DEVICES=0": + env_cuda = env.get("CUDA_VISIBLE_DEVICES") + if env_cuda is not None and device_setting == "0": logger.info(f"Using existing environment variable CUDA_VISIBLE_DEVICES='{env_cuda}'") - device_setting = "" - - prefix = f"{device_setting} " if device_setting else "" - cmd = ( - f"{prefix}nnUNetv2_train " - + f"{self.dataset_name_or_id} {config} {fold} " - + f"-tr {self.trainer_class_name} -num_gpus {num_gpus}" - ) + device_setting = None + elif device_setting is not None: + env["CUDA_VISIBLE_DEVICES"] = device_setting + + cmd = [ + "nnUNetv2_train", + f"{self.dataset_name_or_id}", + f"{config}", + f"{fold}", + "-tr", + f"{self.trainer_class_name}", + "-num_gpus", + f"{num_gpus}", + ] if self.export_validation_probabilities: - cmd += " --npz" + cmd.append("--npz") for _key, _value in kwargs.items(): if _key == "p" or _key == "pretrained_weights": - cmd += f" -{_key} {_value}" + cmd.extend([f"-{_key}", f"{_value}"]) else: - cmd += f" --{_key} {_value}" - return cmd + cmd.extend([f"--{_key}", f"{_value}"]) + return cmd, env def train( self,