4 changes: 2 additions & 2 deletions pixi.lock

Some generated files are not rendered by default.

6 changes: 5 additions & 1 deletion src/imgtools/autopipeline.py
Collaborator:

"no output files, check the directory" sounds a bit confusing? what are they checking?

Collaborator (Author):

Can you send the full output?

@@ -262,7 +262,7 @@ def process_one_sample(
SampleNumber=idx,
)
result.output_files = list(saved_files)
-        if not result.output_files:
+        if not result.output_files and not sample_output.dry_run:
raise ValueError(
"No output files were saved. Check the output directory."
)
@@ -305,6 +305,7 @@ def __init__(
spacing: tuple[float, float, float] = (0.0, 0.0, 0.0),
window: float | None = None,
level: float | None = None,
dry_run: bool = False,
) -> None:
"""
Initialize the Autopipeline.
Expand Down Expand Up @@ -342,6 +343,8 @@ def __init__(
Window level for intensity normalization, by default None
level : float | None, optional
Window level for intensity normalization, by default None
dry_run : bool, optional
Whether to run the pipeline in dry run mode, by default False
"""
self.input = SampleInput.build(
directory=Path(input_directory),
@@ -359,6 +362,7 @@ def __init__(
filename_format=output_filename_format,
existing_file_mode=existing_file_mode,
extra_context={},
dry_run=dry_run,
)

transforms: list[BaseTransform] = [
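The guard change above is easiest to see in isolation. A minimal, runnable sketch with simplified names (not the project's actual classes) of the intent behind the dry-run exemption: a dry run is expected to produce no files, so an empty output_files list should not raise.

from dataclasses import dataclass, field

@dataclass
class Result:
    output_files: list[str] = field(default_factory=list)

def check_outputs(result: Result, dry_run: bool) -> None:
    # An empty output list is only an error when files were actually
    # supposed to be written; a dry run produces no files by design.
    if not result.output_files and not dry_run:
        raise ValueError(
            "No output files were saved. Check the output directory."
        )

check_outputs(Result(), dry_run=True)   # passes: nothing written on purpose
check_outputs(Result(), dry_run=False)  # raises ValueError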
9 changes: 9 additions & 0 deletions src/imgtools/cli/autopipeline.py
@@ -165,6 +165,13 @@ def parse_spacing(ctx, param, value): # type: ignore
default=None,
help="Path to YAML file containing ROI matching patterns."
)
@click.option(
"--dry-run",
"-d",
is_flag=True,
default=False,
help="Run the pipeline in dry run mode, by default False"
)
@click.help_option(
"-h",
"--help",
@@ -186,6 +193,7 @@ def autopipeline(
roi_on_missing_regex: str,
roi_match_map: Tuple[str],
roi_match_yaml: Path,
dry_run: bool,
) -> None:
"""Core utility to process messy DICOM data into organized NIfTI files.

@@ -263,6 +271,7 @@ def autopipeline(
spacing=spacing,
window=window,
level=level,
dry_run=dry_run,
)

# Run the pipeline
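To sanity-check the new flag, click's test runner can exercise the command; a sketch assuming the command object is importable from the module path shown in this diff:

from click.testing import CliRunner
from imgtools.cli.autopipeline import autopipeline

runner = CliRunner()
result = runner.invoke(autopipeline, ["--help"])
assert result.exit_code == 0
assert "--dry-run" in result.output  # the new flag is registered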
9 changes: 9 additions & 0 deletions src/imgtools/dicom/dicom_metadata/extractor_base.py
@@ -108,6 +108,15 @@ class ModalityMetadataExtractor(ABC):
"SeriesInstanceUID",
"StudyInstanceUID",
"Modality",
# Sensitive Patient Demographics (PHI)
# These tags contain Protected Health Information (PHI) and are sensitive.
"PatientSex",
"PatientBirthDate",
"PatientAge",
"EthnicGroup",
"PatientWeight",
"PatientSize",
"AdditionalPatientHistory",
# Image Geometry & Size
"BodyPartExamined",
"DataCollectionDiameter",
8 changes: 7 additions & 1 deletion src/imgtools/io/sample_output.py
@@ -131,7 +131,11 @@ class SampleOutput(BaseModel):
{"dataset": "NSCLC-Radiomics", "processing_date": "2025-04-22"}
],
)

dry_run: bool = Field(
default=False,
description="Whether to run the pipeline in dry run mode, by default False. Will generate the output index listing images and specified masks with ROIs.",
title="Dry Run",
)
_writer: AbstractBaseWriter | None = PrivateAttr(default=None)

def model_post_init(self, __context) -> None: # type: ignore # noqa: ANN001
@@ -141,6 +145,8 @@ def model_post_init(self, __context) -> None: # type: ignore # noqa: ANN001
existing_file_mode=self.existing_file_mode,
filename_format=self.filename_format,
context=self.extra_context,
dry_run=self.dry_run,
create_dirs=not self.dry_run, # only create directories if not in dry run mode
)

@field_validator("directory")
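The create_dirs=not self.dry_run wiring means a preview run cannot even leave empty directory trees behind. A toy sketch of that pattern, where make_writer is a hypothetical stand-in for the real writer construction:

from pydantic import BaseModel, PrivateAttr

def make_writer(dry_run: bool, create_dirs: bool) -> dict:
    # Hypothetical stand-in: the real code builds a writer instance here.
    return {"dry_run": dry_run, "create_dirs": create_dirs}

class SketchOutput(BaseModel):
    dry_run: bool = False
    _writer: dict | None = PrivateAttr(default=None)

    def model_post_init(self, __context) -> None:  # noqa: ANN001
        # A dry run must not touch the filesystem, so directory creation
        # is disabled along with the writes themselves.
        self._writer = make_writer(
            dry_run=self.dry_run,
            create_dirs=not self.dry_run,
        )

assert SketchOutput(dry_run=True)._writer == {"dry_run": True, "create_dirs": False}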
33 changes: 29 additions & 4 deletions src/imgtools/io/writers/abstract_base_writer.py
@@ -126,6 +126,10 @@ class AbstractBaseWriter(ABC, Generic[ContentType]):
absolute_paths_in_index : bool, default=False
If True, saves absolute paths in the index file.
If False, saves paths relative to the root directory.
dry_run : bool, default=False
If True, skip the actual file I/O operations but still write to the index
and return the resolved path. Useful for previewing what would be saved
or updating the index without writing files.
pattern_resolver : PatternResolver
Instance used to handle filename formatting with placeholders.

@@ -152,6 +156,7 @@ class AbstractBaseWriter(ABC, Generic[ContentType]):
pattern_resolver: PatternResolver = field(init=False)
overwrite_index: bool = field(default=False)
absolute_paths_in_index: bool = field(default=False)
dry_run: bool = field(default=False)

index_filename: Optional[str] = field(default=None)
_checked_directories: set[str] = field(default_factory=set, init=False)
@@ -212,6 +217,21 @@ def save(self, data: ContentType, **kwargs: Any) -> Path:
updating them with the kwargs passed from the save method.

This will help simplify repeated saves with similar context variables.

When `self.dry_run` is True, implementations should skip file I/O operations
but still write to the index and return the resolved path.

Parameters
----------
data : ContentType
The data to be saved.
**kwargs : Any
Additional context variables for filename generation and path resolution.

Returns
-------
Path
The resolved path where the file would be (or was) saved.
"""
pass

@@ -332,6 +352,10 @@ def resolve_path(self, **kwargs: object) -> Path:
# msg = f"Directory {out_path.parent} does not exist."
# raise DirectoryNotFoundError(msg)
return out_path

        if self.dry_run:
            # Short-circuit so the existing_file_mode handling below never
            # runs in dry-run mode.
            return out_path

match self.existing_file_mode:
case ExistingFileMode.SKIP:
return out_path
@@ -612,7 +636,7 @@ def save(self, data: str, **kwargs: object) -> Path:

Parameters
----------
-        content : str
+        data : str
The content to write to the file.
**kwargs : Any
Additional context for filename generation.
Expand All @@ -625,9 +649,10 @@ def save(self, data: str, **kwargs: object) -> Path:
# Resolve the output file path
output_path = self.resolve_path(**kwargs)

-        # Write content to the file
-        with output_path.open(mode="w", encoding="utf-8") as f:
-            f.write(data)
+        # Write content to the file only if not in dry_run mode
+        if not self.dry_run:
+            with output_path.open(mode="w", encoding="utf-8") as f:
+                f.write(data)

self.add_to_index(output_path, replace_existing=output_path.exists())

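The contract the save docstring spells out (skip the I/O, still index, still return the resolved path) is easy to demonstrate in a stripped-down subclass. A sketch with simplified structure, not the project's actual writer hierarchy:

from pathlib import Path

class SketchTextWriter:
    """Toy writer honoring the documented dry-run contract."""

    def __init__(self, root: Path, dry_run: bool = False) -> None:
        self.root = root
        self.dry_run = dry_run
        self.index: list[Path] = []  # stand-in for the real index file

    def save(self, data: str, filename: str) -> Path:
        out_path = self.root / filename
        if not self.dry_run:
            # Real run: create directories and write the file.
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(data, encoding="utf-8")
        # Dry run or not: record the entry and return the resolved path.
        self.index.append(out_path)
        return out_path

writer = SketchTextWriter(Path("out"), dry_run=True)
path = writer.save("hello", "a.txt")  # indexed, but no file on disk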
39 changes: 20 additions & 19 deletions src/imgtools/io/writers/nifti_writer.py
@@ -195,25 +195,26 @@ def save(self, data: sitk.Image | np.ndarray, **kwargs: object) -> Path:
else:
out_path = self.resolve_path(**kwargs)

-        if (
-            out_path.exists()  # check if it exists
-            # This will only be true if SKIP,
-            # OVERWRITE would have deleted the file
-            and self.existing_file_mode == ExistingFileMode.SKIP
-        ):
-            logger.debug("File exists, skipping.", out_path=out_path)
-            return out_path
-
-        try:
-            sitk.WriteImage(
-                image,
-                out_path.as_posix(),
-                useCompression=True,
-                compressionLevel=self.compression_level,
-            )
-        except Exception as e:
-            msg = f"Error writing image to file {out_path}: {e}"
-            raise NiftiWriterIOError(msg) from e
+        if not self.dry_run:
+            if (
+                out_path.exists()  # check if it exists
+                # This will only be true if SKIP,
+                # OVERWRITE would have deleted the file
+                and self.existing_file_mode == ExistingFileMode.SKIP
+            ):
+                logger.debug("File exists, skipping.", out_path=out_path)
+                return out_path
+
+            try:
+                sitk.WriteImage(
+                    image,
+                    out_path.as_posix(),
+                    useCompression=True,
+                    compressionLevel=self.compression_level,
+                )
+            except Exception as e:
+                msg = f"Error writing image to file {out_path}: {e}"
+                raise NiftiWriterIOError(msg) from e

self.add_to_index(
out_path,
49 changes: 27 additions & 22 deletions src/imgtools/io/writers/numpy_writer.py
@@ -82,6 +82,8 @@ def save(
----------
data : np.ndarray | sitk.Image | dict[str, np.ndarray | sitk.Image]
The data to save. Can be a single image or a dictionary of images.
**kwargs : object
Additional context for filename generation.
Returns
-------
@@ -95,29 +97,32 @@
"""
out_path = self.resolve_path(**kwargs)

-        if isinstance(data, (np.ndarray, sitk.Image)):
-            # Single image or array
-            array, metadata = self._to_numpy(data)
-            np.savez_compressed(out_path, image_array=array, **metadata)
-        elif isinstance(data, dict):
-            # Multiple images or arrays
-            arrays = {}
-            metadata = {}
-            for key, value in data.items():
-                array, meta = self._to_numpy(value)
-                arrays[key] = array
-                for meta_key, meta_value in meta.items():
-                    metadata[f"{key}_{meta_key}"] = meta_value
-            if self.compressed:
-                np.savez_compressed(
-                    out_path, allow_pickle=False, **arrays, **metadata
-                )
-            else:
-                np.savez(out_path, allow_pickle=False, **arrays, **metadata)
-        else:
-            raise NumpyWriterValidationError(
-                "Data must be a NumPy array, SimpleITK image, or a dictionary of these types."
-            )
+        if not self.dry_run:
+            if isinstance(data, (np.ndarray, sitk.Image)):
+                # Single image or array
+                array, metadata = self._to_numpy(data)
+                np.savez_compressed(out_path, image_array=array, **metadata)
+            elif isinstance(data, dict):
+                # Multiple images or arrays
+                arrays = {}
+                metadata = {}
+                for key, value in data.items():
+                    array, meta = self._to_numpy(value)
+                    arrays[key] = array
+                    for meta_key, meta_value in meta.items():
+                        metadata[f"{key}_{meta_key}"] = meta_value
+                if self.compressed:
+                    np.savez_compressed(
+                        out_path, allow_pickle=False, **arrays, **metadata
+                    )
+                else:
+                    np.savez(
+                        out_path, allow_pickle=False, **arrays, **metadata
+                    )
+            else:
+                raise NumpyWriterValidationError(
+                    "Data must be a NumPy array, SimpleITK image, or a dictionary of these types."
+                )
Comment on lines +100 to +125

Contributor:

⚠️ Potential issue | 🟠 Major

Preserve data validation behavior under dry_run and optionally centralize error message

The dry-run guard cleanly skips actual filesystem writes, but it currently also skips all type validation:

  • The isinstance checks and _to_numpy calls only run when not self.dry_run.
  • If dry_run=True and data is an unsupported type, no NumpyWriterValidationError is raised; instead, the method quietly indexes a path and returns it. That makes it easy for a dry-run to “succeed” while a real run would fail at write time.

To keep dry-run useful for catching pipeline issues while still avoiding I/O, consider validating data independently of the write, e.g.:

-        out_path = self.resolve_path(**kwargs)
-
-        if not self.dry_run:
-            if isinstance(data, (np.ndarray, sitk.Image)):
+        out_path = self.resolve_path(**kwargs)
+
+        # Validate input type regardless of dry-run to surface misuses early.
+        if isinstance(data, (np.ndarray, sitk.Image)):
+            to_save = ("single", data)
+        elif isinstance(data, dict):
+            to_save = ("dict", data)
+        else:
+            raise NumpyWriterValidationError(
+                "Data must be a NumPy array, SimpleITK image, or a dictionary of these types."
+            )
+
+        if not self.dry_run:
+            kind, payload = to_save
+            if kind == "single":
+                data_single = payload
                 # Single image or array
-                array, metadata = self._to_numpy(data)
+                array, metadata = self._to_numpy(data_single)
                 np.savez_compressed(out_path, image_array=array, **metadata)
-            elif isinstance(data, dict):
+            else:  # dict
                 # Multiple images or arrays
-                arrays = {}
-                metadata = {}
-                for key, value in data.items():
+                arrays: dict[str, np.ndarray] = {}
+                metadata: dict[str, object] = {}
+                for key, value in payload.items():
                 ...
-            else:
-                raise NumpyWriterValidationError(
-                    "Data must be a NumPy array, SimpleITK image, or a dictionary of these types."
-                )

This keeps validation semantics identical between normal and dry-run modes while still avoiding writes.

Optionally, to satisfy TRY003 and improve reuse, you could move the long error message into a class-level constant on NumpyWriterValidationError or NumPyWriter and reference it from both _to_numpy and save, but that’s more of a style/readability tweak than a functional requirement.

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.14.5)

123-125: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In src/imgtools/io/writers/numpy_writer.py around lines 100 to 125, the current
dry-run branch skips all type validation so unsupported input types never raise
NumpyWriterValidationError during dry runs; change the flow to always validate
input (perform isinstance checks and call self._to_numpy or a dedicated
validation helper) before checking self.dry_run, then only skip the actual
np.savez/np.savez_compressed when dry_run is True; optionally extract the long
error message into a class-level constant on NumpyWriter or
NumpyWriterValidationError and reuse it in both _to_numpy and save to avoid
duplication.
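
Distilled into code, the suggestion is to validate before the dry-run gate so both modes fail identically; a sketch of NumpyWriter.save where _write_arrays is a hypothetical helper wrapping the existing np.savez / np.savez_compressed logic:

import numpy as np
import SimpleITK as sitk

def save(self, data, **kwargs):  # sketch, not a drop-in replacement
    out_path = self.resolve_path(**kwargs)
    # Validate regardless of dry_run so misuse surfaces in previews too.
    if not isinstance(data, (np.ndarray, sitk.Image, dict)):
        raise NumpyWriterValidationError(
            "Data must be a NumPy array, SimpleITK image, "
            "or a dictionary of these types."
        )
    if not self.dry_run:
        self._write_arrays(out_path, data)  # hypothetical helper
    self.add_to_index(out_path, replace_existing=out_path.exists())
    return out_path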


self.add_to_index(
out_path,