diff --git a/colorizer_data/converter.py b/colorizer_data/converter.py index 9d3005c..1f84ad1 100644 --- a/colorizer_data/converter.py +++ b/colorizer_data/converter.py @@ -24,9 +24,10 @@ ) from colorizer_data.utils import ( INITIAL_INDEX_COLUMN, - _get_frame_count_from_3d_source, + get_frame_count_from_3d_source, configureLogging, generate_frame_paths, + get_relative_path_or_url, get_total_objects, merge_dictionaries, read_data_array_file, @@ -368,22 +369,66 @@ def _validate_manifest(writer: ColorizerDatasetWriter): ) +def _format_and_validate_file_source( + name: str, path: str | None, out_dir: pathlib.Path +) -> str: + relative_path = get_relative_path_or_url(out_dir, path) + if relative_path is None: + raise ValueError( + f"{name} '{path}' must be an HTTP(S) URL or inside the output directory '{out_dir}'." + ) + if not ( + relative_path.startswith("http://") or relative_path.startswith("https://") + ): + full_path = pathlib.Path(out_dir / relative_path).resolve() + if not full_path.exists(): + raise FileNotFoundError(f"{name} '{full_path}' does not exist.") + return relative_path + + +def _validate_frames_3d_paths(data: Frames3dMetadata, outpath: pathlib.Path) -> None: + if data.source is None: + raise ValueError("Frames3dMetadata.source must be defined.") + data.source = _format_and_validate_file_source( + "Frames3dMetadata.source", data.source, outpath + ) + + # Repeat for the backdrops, if defined + if data.backdrops is not None: + for i in range(len(data.backdrops)): + if data.backdrops[i].source is None: + raise ValueError( + f"Frames3dMetadata.backdrops[{i}].source must be defined." + ) + data.backdrops[i].source = _format_and_validate_file_source( + f"Frames3dMetadata.backdrops[{i}].source", + data.backdrops[i].source, + outpath, + ) + + def _handle_3d_frames( data: DataFrame, writer: ColorizerDatasetWriter, config: ConverterConfig ) -> None: + source = config.frames_3d.source # Check for 3D frame src (TODO: safe to assume Zarr?) + if source is None: + raise ValueError("Frames3dMetadata.source cannot be None.") + # If 3D frame src is provided, go to 3D source (using bioio) and check the number of frames. fallback_frame_count = int(data[config.times_column].max()) + 1 if config.frames_3d.total_frames is None: try: - config.frames_3d.total_frames = _get_frame_count_from_3d_source( + config.frames_3d.total_frames = get_frame_count_from_3d_source( config.frames_3d.source ) except Exception as e: - logging.error( - f"Error getting frame count from 3D source: {e}. Using fallback frame count of {fallback_frame_count}." + logging.warning( + f"Encountered an error while getting frame count from 3D source; auto-detected fallback frame count ({fallback_frame_count} frames) will be used instead. If this is incorrect, please define 'Frames3dMetadata.totalFrames'. Error: {e}. " ) config.frames_3d.total_frames = fallback_frame_count + + _validate_frames_3d_paths(config.frames_3d, writer.outpath) writer.set_3d_frame_data(config.frames_3d) @@ -443,7 +488,9 @@ def convert_colorizer_data( be flattened along the Z-axis using a max projection. If `None`, 2D frame generation will be skipped. frames_3d (Frames3dMetadata | None): A `Frames3dMetadata` object containing the 3D image source - ("source") and channel ("segmentation_channel") to use for the 3D image source. + ("source") and channel ("segmentation_channel") to use for the 3D image source. The source + must be either a URL or a path inside the output dataset directory. If `None`, 3D frames + will not be included in the dataset. centroid_x_column (str): The name of the column containing x-coordinates of object centroids, in pixels relative to the frame image, where 0 is the left edge of the image. Defaults to "Centroid X." @@ -587,7 +634,7 @@ def convert_colorizer_data( logging.info( "No image column provided, so 2D frame generation will be skipped." ) - elif image_column not in data.columns: + elif image_column not in data.columns or data[image_column].isnull().all(): logging.warning( f"Image column '{image_column}' not found in the dataset. 2D frame generation will be skipped." ) diff --git a/colorizer_data/utils.py b/colorizer_data/utils.py index 18fc6f1..316fe02 100644 --- a/colorizer_data/utils.py +++ b/colorizer_data/utils.py @@ -772,7 +772,65 @@ def get_duplicate_items(input: List[str]) -> List[str]: return [item for item, count in collections.Counter(input).items() if count > 1] -def _get_frame_count_from_3d_source(source: str) -> int: +def get_frame_count_from_3d_source(source: str) -> int: # Attempt to read the image to get info (such as length) img = BioImage(source) return int(img.dims.T) + + +def is_url(source: str) -> bool: + """ + Checks if a source string is an HTTP(S) URL. + """ + return source.startswith("http://") or source.startswith("https://") + + +def check_file_source(name: str, source: str | None, outpath: pathlib.Path) -> None: + """ + Logs warnings for missing or unreachable file sources. + """ + if source is None: + logging.error(f"{name} is undefined.") + elif not is_url(source): + # Check for absolute paths, parent paths, or missing files/folders. + if os.path.isabs(source): + logging.error( + f"{name} must be a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'" + ) + elif ".." in pathlib.Path(source).parts: + logging.warning( + f"{name} should not contain parent directory references ('..'), as it may fail to load in certain deploy environments. Received: '{source}'" + ) + elif not os.path.exists(outpath / source): + logging.warning( + f"{name} path could not be found. Please check that it exists. Received: '{source}'" + ) + + +def get_relative_path_or_url(directory_path: pathlib.Path, source: str) -> str | None: + """ + Validates and formats a path or URL from a source file string. Paths will be + made relative to `directory_path` if they are inside of it. Paths outside of + `directory_path` will return `None`. URLs will be returned as-is. + + Args: + - directory_path (pathlib.Path): The base directory path (usually + `writer.outpath`). + - path (str): The path or URL to validate and format. + + Returns: + - If the provided path is a path inside of the `directory_path`, returns + the relative path from `directory_path` to `path`. + - If `path` is a URL, returns it. + - `None` if `path` is a file path but is not inside of `directory_path`. + """ + is_url = source.startswith("http://") or source.startswith("https://") + if not is_url: + # Check if path is inside dataset directory and fix to make relative + source = pathlib.Path(source).resolve() + if not os.path.isabs(source): + source = pathlib.Path(directory_path / source).resolve() + if directory_path not in source.parents: + return None + source = source.relative_to(directory_path).as_posix() + return source diff --git a/colorizer_data/writer.py b/colorizer_data/writer.py index 086610f..3f4bb3c 100644 --- a/colorizer_data/writer.py +++ b/colorizer_data/writer.py @@ -21,7 +21,8 @@ from colorizer_data.utils import ( DEFAULT_FRAME_PREFIX, DEFAULT_FRAME_SUFFIX, - _get_frame_count_from_3d_source, + get_frame_count_from_3d_source, + check_file_source, cast_feature_to_info_type, copy_remote_or_local_file, generate_frame_paths, @@ -507,11 +508,13 @@ def set_frame_paths(self, paths: List[str]) -> None: self.manifest["frames"] = paths def set_3d_frame_data(self, data: Frames3dMetadata) -> None: + # Data validation if data.total_frames is None: logging.warning( "ColorizerDatasetWriter: The `total_frames` property of the Frames3dMetadata object is `None`. Will attempt to infer the number of frames from the provided data." ) - data.total_frames = _get_frame_count_from_3d_source(data.source) + data.total_frames = get_frame_count_from_3d_source(data.source) + self.manifest["frames3d"] = data.to_dict() # TODO: when DatasetManifest is a dataclass, it can serialize Frames3dMetadata directly # instead of needing to call to_dict() here. @@ -711,3 +714,16 @@ def validate_dataset( + " or add an offset if your frame numbers do not start at 0." + " You may also need to generate the list of frames yourself if your dataset is skipping frames." ) + + # Check that frames3d sources are reachable + if "frames3d" in self.manifest: + frames3d_metadata = Frames3dMetadata.from_dict(self.manifest["frames3d"]) + source = frames3d_metadata.source + check_file_source("3D frames source", source, self.outpath) + # Validate backdrops + if frames3d_metadata.backdrops is not None: + for i in range(len(frames3d_metadata.backdrops)): + backdrop_source = frames3d_metadata.backdrops[i].source + check_file_source( + f"3D frames backdrop {i} source", backdrop_source, self.outpath + ) diff --git a/tests/test_converter.py b/tests/test_converter.py index 6597e34..69a4da4 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -9,7 +9,12 @@ from typing import Dict, List, Union from colorizer_data import convert_colorizer_data -from colorizer_data.types import DataFileType, FeatureMetadata, Frames3dMetadata +from colorizer_data.types import ( + Backdrop3dMetadata, + DataFileType, + FeatureMetadata, + Frames3dMetadata, +) from colorizer_data.utils import read_data_array_file asset_path = pathlib.Path(__file__).parent / "assets" @@ -432,28 +437,182 @@ def test_skips_frame_generation_if_no_image_column(existing_dataset): # ///////////////////////// 3D FRAME TESTS ///////////////////////// -def test_writes_3d_data(tmp_path): - csv_content = f"{sample_csv_headers}\n{sample_csv_data}" - csv_data = pd.read_csv(StringIO(csv_content)) +class TestFrames3DWriting: + def convert_3d_data_with_source(self, tmp_path, source: str): + csv_content = f"{sample_csv_headers}\n{sample_csv_data}" + csv_data = pd.read_csv(StringIO(csv_content)) - convert_colorizer_data( - csv_data, - tmp_path, - frames_3d=Frames3dMetadata( - source="https://example.com/3d.ome.zarr", segmentation_channel=1 - ), - ) + convert_colorizer_data( + csv_data, + tmp_path, + image_column=None, + frames_3d=Frames3dMetadata( + source=source, segmentation_channel=1, total_frames=2 + ), + ) - with open(tmp_path / "manifest.json", "r") as f: - manifest = json.load(f) - assert manifest["frames3d"] == { - "source": "https://example.com/3d.ome.zarr", + def make_3d_manifest(self, source: str): + return { + "source": source, "segmentationChannel": 1, # Total frames derived from times array if data source does not exist "totalFrames": 2, "backdrops": None, } + def test_3d_allows_source_urls(self, tmp_path): + self.convert_3d_data_with_source(tmp_path, "https://example.com/3d.ome.zarr") + with open(tmp_path / "manifest.json", "r") as f: + manifest = json.load(f) + assert manifest["frames3d"] == self.make_3d_manifest( + "https://example.com/3d.ome.zarr" + ) + + def test_3d_writes_local_paths(self, tmp_path): + os.chdir(tmp_path) + os.makedirs(tmp_path / "data.zarr") + self.convert_3d_data_with_source(tmp_path, "data.zarr") + with open(tmp_path / "manifest.json", "r") as f: + manifest = json.load(f) + assert manifest["frames3d"] == self.make_3d_manifest("data.zarr") + + def test_3d_allows_absolute_paths(self, tmp_path): + os.chdir(tmp_path) + os.makedirs(tmp_path / "data.zarr") + abs_path = tmp_path / "data.zarr" + self.convert_3d_data_with_source(tmp_path, str(abs_path)) + with open(tmp_path / "manifest.json", "r") as f: + manifest = json.load(f) + # should transform into relative path + assert manifest["frames3d"] == self.make_3d_manifest("data.zarr") + + def test_3d_throws_error_for_none_path(self, tmp_path): + with pytest.raises(ValueError): + self.convert_3d_data_with_source(tmp_path, None) + + def test_3d_throws_error_for_missing_paths(self, tmp_path): + os.chdir(tmp_path) + with pytest.raises(FileNotFoundError): + self.convert_3d_data_with_source(tmp_path, "data.zarr") + + def test_3d_throws_error_for_paths_outside_dir(self, tmp_path): + os.chdir(tmp_path) + # Create a zarr outside of the directory + os.makedirs(tmp_path / "data.zarr") + + csv_content = f"{sample_csv_headers}\n{sample_csv_data}" + csv_data = pd.read_csv(StringIO(csv_content)) + + with pytest.raises(ValueError): + convert_colorizer_data( + csv_data, + tmp_path / "dataset", + image_column=None, + frames_3d=Frames3dMetadata( + source="../data.zarr", segmentation_channel=1, total_frames=2 + ), + ) + + def test_3d_writes_backdrop_paths(self, tmp_path): + os.chdir(tmp_path) + os.makedirs(tmp_path / "data.zarr") + os.makedirs(tmp_path / "backdrop1.zarr") + os.makedirs(tmp_path / "backdrop2.zarr") + + csv_content = f"{sample_csv_headers}\n{sample_csv_data}" + csv_data = pd.read_csv(StringIO(csv_content)) + + convert_colorizer_data( + csv_data, + tmp_path, + image_column=None, + frames_3d=Frames3dMetadata( + source="data.zarr", + segmentation_channel=0, + total_frames=2, + backdrops=[ + Backdrop3dMetadata( + name="Backdrop 1", + source="backdrop1.zarr", + channel_index=0, + min=0, + max=10, + ), + Backdrop3dMetadata( + name="Backdrop 2", + source="backdrop1.zarr", + channel_index=1, + min=255, + max=512, + ), + Backdrop3dMetadata( + name="Backdrop 3", + source="backdrop2.zarr", + channel_index=3, + min=-100, + max=100, + ), + ], + ), + ) + with open(tmp_path / "manifest.json", "r") as f: + assert json.load(f)["frames3d"] == { + "source": "data.zarr", + "segmentationChannel": 0, + # Total frames derived from times array if data source does not exist + "totalFrames": 2, + "backdrops": [ + { + "name": "Backdrop 1", + "source": "backdrop1.zarr", + "channelIndex": 0, + "min": 0, + "max": 10, + }, + { + "name": "Backdrop 2", + "source": "backdrop1.zarr", + "channelIndex": 1, + "min": 255, + "max": 512, + }, + { + "name": "Backdrop 3", + "source": "backdrop2.zarr", + "channelIndex": 3, + "min": -100, + "max": 100, + }, + ], + } + + def test_3d_throws_error_for_backdrop_paths_outside_dir(self, tmp_path): + os.chdir(tmp_path) + os.makedirs(tmp_path / "dataset" / "data.zarr") + os.makedirs(tmp_path / "backdrop.zarr") + + csv_content = f"{sample_csv_headers}\n{sample_csv_data}" + csv_data = pd.read_csv(StringIO(csv_content)) + + with pytest.raises(ValueError): + convert_colorizer_data( + csv_data, + tmp_path / "dataset", + image_column=None, + frames_3d=Frames3dMetadata( + source="data.zarr", + segmentation_channel=1, + total_frames=2, + backdrops=[ + Backdrop3dMetadata( + name="Invalid Backdrop", + source="../backdrop.zarr", + channel_index=1, + ) + ], + ), + ) + # ///////////////////////// BACKDROP TESTS /////////////////////////