Skip to content
Draft
59 changes: 53 additions & 6 deletions colorizer_data/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
)
from colorizer_data.utils import (
INITIAL_INDEX_COLUMN,
_get_frame_count_from_3d_source,
get_frame_count_from_3d_source,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed this function

configureLogging,
generate_frame_paths,
get_relative_path_or_url,
get_total_objects,
merge_dictionaries,
read_data_array_file,
Expand Down Expand Up @@ -368,22 +369,66 @@ def _validate_manifest(writer: ColorizerDatasetWriter):
)


def _format_and_validate_file_source(
name: str, path: str | None, out_dir: pathlib.Path
) -> str:
relative_path = get_relative_path_or_url(out_dir, path)
if relative_path is None:
raise ValueError(
f"{name} '{path}' must be an HTTP(S) URL or inside the output directory '{out_dir}'."
)
if not (
relative_path.startswith("http://") or relative_path.startswith("https://")
):
full_path = pathlib.Path(out_dir / relative_path).resolve()
if not full_path.exists():
raise FileNotFoundError(f"{name} '{full_path}' does not exist.")
return relative_path


def _validate_frames_3d_paths(data: Frames3dMetadata, outpath: pathlib.Path) -> None:
if data.source is None:
raise ValueError("Frames3dMetadata.source must be defined.")
data.source = _format_and_validate_file_source(
"Frames3dMetadata.source", data.source, outpath
)

# Repeat for the backdrops, if defined
if data.backdrops is not None:
for i in range(len(data.backdrops)):
if data.backdrops[i].source is None:
raise ValueError(
f"Frames3dMetadata.backdrops[{i}].source must be defined."
)
data.backdrops[i].source = _format_and_validate_file_source(
f"Frames3dMetadata.backdrops[{i}].source",
data.backdrops[i].source,
outpath,
)


def _handle_3d_frames(
data: DataFrame, writer: ColorizerDatasetWriter, config: ConverterConfig
) -> None:
source = config.frames_3d.source
# Check for 3D frame src (TODO: safe to assume Zarr?)
if source is None:
raise ValueError("Frames3dMetadata.source cannot be None.")

# If 3D frame src is provided, go to 3D source (using bioio) and check the number of frames.
fallback_frame_count = int(data[config.times_column].max()) + 1
if config.frames_3d.total_frames is None:
try:
config.frames_3d.total_frames = _get_frame_count_from_3d_source(
config.frames_3d.total_frames = get_frame_count_from_3d_source(
config.frames_3d.source
)
except Exception as e:
logging.error(
f"Error getting frame count from 3D source: {e}. Using fallback frame count of {fallback_frame_count}."
logging.warning(
f"Encountered an error while getting frame count from 3D source; auto-detected fallback frame count ({fallback_frame_count} frames) will be used instead. If this is incorrect, please define 'Frames3dMetadata.totalFrames'. Error: {e}. "
)
config.frames_3d.total_frames = fallback_frame_count

_validate_frames_3d_paths(config.frames_3d, writer.outpath)
writer.set_3d_frame_data(config.frames_3d)


Expand Down Expand Up @@ -443,7 +488,9 @@ def convert_colorizer_data(
be flattened along the Z-axis using a max projection. If `None`, 2D frame generation
will be skipped.
frames_3d (Frames3dMetadata | None): A `Frames3dMetadata` object containing the 3D image source
("source") and channel ("segmentation_channel") to use for the 3D image source.
("source") and channel ("segmentation_channel") to use for the 3D image source. The source
must be either a URL or a path inside the output dataset directory. If `None`, 3D frames
will not be included in the dataset.
Comment on lines +491 to +493
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have an idea for supporting files outside the output dataset directory via symlinks, which I'll open as pt. 3 of this chain. For now, enforcing that paths must be inside the output dataset directory (or be a URL) will prevent loading errors, but it potentially requires users to move their (huge) Zarr data around.

centroid_x_column (str): The name of the column containing x-coordinates of object
centroids, in pixels relative to the frame image, where 0 is the left edge of the
image. Defaults to "Centroid X."
Expand Down Expand Up @@ -587,7 +634,7 @@ def convert_colorizer_data(
logging.info(
"No image column provided, so 2D frame generation will be skipped."
)
elif image_column not in data.columns:
elif image_column not in data.columns or data[image_column].isnull().all():
logging.warning(
f"Image column '{image_column}' not found in the dataset. 2D frame generation will be skipped."
)
Expand Down
60 changes: 59 additions & 1 deletion colorizer_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,65 @@ def get_duplicate_items(input: List[str]) -> List[str]:
return [item for item, count in collections.Counter(input).items() if count > 1]


def _get_frame_count_from_3d_source(source: str) -> int:
def get_frame_count_from_3d_source(source: str) -> int:
# Attempt to read the image to get info (such as length)
img = BioImage(source)
return int(img.dims.T)


def is_url(source: str) -> bool:
"""
Checks if a source string is an HTTP(S) URL.
"""
return source.startswith("http://") or source.startswith("https://")


def check_file_source(name: str, source: str | None, outpath: pathlib.Path) -> None:
"""
Logs warnings for missing or unreachable file sources.
"""
if source is None:
logging.error(f"{name} is undefined.")
elif not is_url(source):
# Check for absolute paths, parent paths, or missing files/folders.
if os.path.isabs(source):
logging.error(
f"{name} must be a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'"
)
elif ".." in pathlib.Path(source).parts:
logging.warning(
f"{name} should not contain parent directory references ('..'), as it may fail to load in certain deploy environments. Received: '{source}'"
)
elif not os.path.exists(outpath / source):
logging.warning(
f"{name} path could not be found. Please check that it exists. Received: '{source}'"
)


def get_relative_path_or_url(directory_path: pathlib.Path, source: str) -> str | None:
"""
Validates and formats a path or URL from a source file string. Paths will be
made relative to `directory_path` if they are inside of it. Paths outside of
`directory_path` will return `None`. URLs will be returned as-is.

Args:
- directory_path (pathlib.Path): The base directory path (usually
`writer.outpath`).
- path (str): The path or URL to validate and format.

Returns:
- If the provided path is a path inside of the `directory_path`, returns
the relative path from `directory_path` to `path`.
- If `path` is a URL, returns it.
- `None` if `path` is a file path but is not inside of `directory_path`.
"""
is_url = source.startswith("http://") or source.startswith("https://")
if not is_url:
# Check if path is inside dataset directory and fix to make relative
source = pathlib.Path(source).resolve()
if not os.path.isabs(source):
source = pathlib.Path(directory_path / source).resolve()
if directory_path not in source.parents:
return None
source = source.relative_to(directory_path).as_posix()
return source
20 changes: 18 additions & 2 deletions colorizer_data/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from colorizer_data.utils import (
DEFAULT_FRAME_PREFIX,
DEFAULT_FRAME_SUFFIX,
_get_frame_count_from_3d_source,
get_frame_count_from_3d_source,
check_file_source,
cast_feature_to_info_type,
copy_remote_or_local_file,
generate_frame_paths,
Expand Down Expand Up @@ -507,11 +508,13 @@ def set_frame_paths(self, paths: List[str]) -> None:
self.manifest["frames"] = paths

def set_3d_frame_data(self, data: Frames3dMetadata) -> None:
# Data validation
if data.total_frames is None:
logging.warning(
"ColorizerDatasetWriter: The `total_frames` property of the Frames3dMetadata object is `None`. Will attempt to infer the number of frames from the provided data."
)
data.total_frames = _get_frame_count_from_3d_source(data.source)
data.total_frames = get_frame_count_from_3d_source(data.source)

self.manifest["frames3d"] = data.to_dict()
# TODO: when DatasetManifest is a dataclass, it can serialize Frames3dMetadata directly
# instead of needing to call to_dict() here.
Expand Down Expand Up @@ -711,3 +714,16 @@ def validate_dataset(
+ " or add an offset if your frame numbers do not start at 0."
+ " You may also need to generate the list of frames yourself if your dataset is skipping frames."
)

# Check that frames3d sources are reachable
if "frames3d" in self.manifest:
frames3d_metadata = Frames3dMetadata.from_dict(self.manifest["frames3d"])
source = frames3d_metadata.source
check_file_source("3D frames source", source, self.outpath)
# Validate backdrops
if frames3d_metadata.backdrops is not None:
for i in range(len(frames3d_metadata.backdrops)):
backdrop_source = frames3d_metadata.backdrops[i].source
check_file_source(
f"3D frames backdrop {i} source", backdrop_source, self.outpath
)
Loading
Loading