diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index e5eec57..28753dc 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -26,8 +26,8 @@ jobs: test -d tests test -f tests/fixtures/input/test-wsi.tif test -f tests/fixtures/input/test-mask.tif - test -f tests/fixtures/gt/test-wsi.tiles.npz - test -f tests/fixtures/gt/test-wsi.tiles.meta.json + test -f tests/fixtures/gt/test-wsi.coordinates.npz + test -f tests/fixtures/gt/test-wsi.coordinates.meta.json - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/README.md b/README.md index 0e221ae..d6d63ba 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ result = tile_slide( artifacts = save_tiling_result(result, output_dir=Path("output")) -print(artifacts.tiles_npz_path) # output/coordinates/slide-1.tiles.npz ; more info in docs/artifacts.md -print(artifacts.tiles_meta_path) # output/coordinates/slide-1.tiles.meta.json ; more info in docs/artifacts.md +print(artifacts.coordinates_npz_path) # output/tiles/slide-1.coordinates.npz ; more info in docs/artifacts.md +print(artifacts.coordinates_meta_path) # output/tiles/slide-1.coordinates.meta.json ; more info in docs/artifacts.md tiling_preview_path = write_tiling_preview( result=result, @@ -155,8 +155,8 @@ More CLI details: [docs/cli.md](docs/cli.md) `hs2p` writes explicit named artifacts rather than anonymous coordinate dumps. -- Tiling writes `coordinates/{sample_id}.tiles.npz` and `coordinates/{sample_id}.tiles.meta.json` -- Sampling writes the same pair under `coordinates//` +- Tiling writes `tiles/{sample_id}.coordinates.npz` and `tiles/{sample_id}.coordinates.meta.json` +- Sampling writes the same pair under `tiles//` - Batch runs also write `process_list.csv` - Saved coordinate arrays use a deterministic column-major order: numeric `x` first, then numeric `y` within each shared `x` diff --git a/docs/README.md b/docs/README.md index 3ec0a60..5b8ebf3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,6 +5,6 @@ - [CLI guide](cli.md) - Input CSV schema, batch commands, and current config layout - [Artifact reference](artifacts.md) - - `.tiles.npz`, `.tiles.meta.json`, and `process_list.csv` + - `.coordinates.npz`, `.coordinates.meta.json`, and `process_list.csv` - [Tissue mask generation](tissue-mask-generation.md) - Standalone script for producing pyramidal tissue masks diff --git a/docs/api.md b/docs/api.md index 77a9e52..df3ae6a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -28,7 +28,7 @@ The config dataclasses keep the main knobs explicit and fill secondary options f - `TilingResult` - In-memory tile coordinates plus read-level metadata for one slide - `TilingArtifacts` - - Paths to the saved `.tiles.npz` and `.tiles.meta.json` outputs + - Paths to the saved `.coordinates.npz` and `.coordinates.meta.json` outputs - Can also carry preview-image paths when batch QC is enabled in `tile_slides()` For field-by-field details, see the dataclass docstrings in [hs2p/api.py](../hs2p/api.py). @@ -185,8 +185,8 @@ You can round-trip a saved result later: from hs2p import load_tiling_result loaded = load_tiling_result( - tiles_npz_path=artifacts.tiles_npz_path, - tiles_meta_path=artifacts.tiles_meta_path, + coordinates_npz_path=artifacts.coordinates_npz_path, + coordinates_meta_path=artifacts.coordinates_meta_path, ) ``` diff --git a/docs/artifacts.md b/docs/artifacts.md index b662c55..35ab2d8 100644 --- a/docs/artifacts.md +++ b/docs/artifacts.md @@ -4,15 +4,15 @@ HS2P writes explicit named artifacts for both the Python API and the CLI. ## Coordinate artifact locations -- Tiling writes one pair per slide under `coordinates/` -- Sampling writes one pair per annotation under `coordinates//` +- Tiling writes one pair per slide under `tiles/` +- Sampling writes one pair per annotation under `tiles//` Each successful output produces: -- `{sample_id}.tiles.npz` -- `{sample_id}.tiles.meta.json` +- `{sample_id}.coordinates.npz` +- `{sample_id}.coordinates.meta.json` -## `.tiles.npz` +## `.coordinates.npz` - `tile_index` - Contiguous tile ids from `0` to `num_tiles - 1` @@ -25,7 +25,7 @@ Each successful output produces: - `tissue_fraction` - Optional per-tile tissue coverage measured during extraction -## `.tiles.meta.json` +## `.coordinates.meta.json` - `sample_id` - Sample identifier that produced the artifact and names the files @@ -69,8 +69,8 @@ Tiling columns: - `mask_path` - `tiling_status` - `num_tiles` -- `tiles_npz_path` -- `tiles_meta_path` +- `coordinates_npz_path` +- `coordinates_meta_path` - `error` - `traceback` diff --git a/docs/cli.md b/docs/cli.md index 230c5cc..8961dc8 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -71,8 +71,8 @@ Detailed logs still go to `output_dir/logs/log.txt`, which is the best place to ## Current config areas -- `tiling.read_tiles_from` - - Optional directory containing precomputed `{sample_id}.tiles.npz` and `{sample_id}.tiles.meta.json` +- `tiling.read_coordinates_from` + - Optional directory containing precomputed `{sample_id}.coordinates.npz` and `{sample_id}.coordinates.meta.json` - `tiling.params` - Core tiling resolution, tile size, overlap, and tissue-threshold settings - `tiling.seg_params` @@ -124,6 +124,6 @@ When enabled, every candidate tile that passes the tissue mask check is read fro - `resume: true` expects the current `process_list.csv` schema and current-format artifacts - reused tiling artifacts are validated against `sample_id`, `config_hash`, `image_path`, and `mask_path` -- `tiling.read_tiles_from` is the supported way to reuse precomputed tiling outputs +- `tiling.read_coordinates_from` is the supported way to reuse precomputed tiling outputs For the exact output files and field meanings, see [artifacts.md](artifacts.md). diff --git a/hs2p/api.py b/hs2p/api.py index 0aa6e0f..31506b6 100644 --- a/hs2p/api.py +++ b/hs2p/api.py @@ -1,11 +1,13 @@ import hashlib +import io import json import multiprocessing as mp +import tarfile import tempfile import traceback import warnings from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict, dataclass, is_dataclass +from dataclasses import asdict, dataclass, is_dataclass, replace from pathlib import Path from typing import Any, Sequence @@ -107,17 +109,18 @@ class TilingArtifacts: Attributes: sample_id: Sample identifier that names the artifact files. - tiles_npz_path: Path to the saved ``.tiles.npz`` coordinate artifact. - tiles_meta_path: Path to the saved ``.tiles.meta.json`` metadata artifact. + coordinates_npz_path: Path to the saved ``.coordinates.npz`` coordinate artifact. + coordinates_meta_path: Path to the saved ``.coordinates.meta.json`` metadata artifact. num_tiles: Number of tiles stored in the artifact pair. mask_preview_path: Optional path to the saved mask preview image. tiling_preview_path: Optional path to the saved tiling preview image. """ sample_id: str - tiles_npz_path: Path - tiles_meta_path: Path + coordinates_npz_path: Path + coordinates_meta_path: Path num_tiles: int + tiles_tar_path: Path | None = None mask_preview_path: Path | None = None tiling_preview_path: Path | None = None @@ -387,17 +390,17 @@ def save_tiling_result( result: TilingResult, output_dir: Path, *, - coordinates_dir: Path | None = None, + tiles_dir: Path | None = None, ) -> TilingArtifacts: _validate_result_consistency(result) - coordinates_dir = ( - Path(coordinates_dir) - if coordinates_dir is not None - else Path(output_dir) / "coordinates" + tiles_dir = ( + Path(tiles_dir) + if tiles_dir is not None + else Path(output_dir) / "tiles" ) - coordinates_dir.mkdir(parents=True, exist_ok=True) - npz_path = coordinates_dir / f"{result.sample_id}.tiles.npz" - meta_path = coordinates_dir / f"{result.sample_id}.tiles.meta.json" + tiles_dir.mkdir(parents=True, exist_ok=True) + npz_path = tiles_dir / f"{result.sample_id}.coordinates.npz" + meta_path = tiles_dir / f"{result.sample_id}.coordinates.meta.json" payload = { "tile_index": result.tile_index.astype(np.int32, copy=False), @@ -439,7 +442,7 @@ def save_tiling_result( with tempfile.NamedTemporaryFile( mode="wb", suffix=".npz", - dir=coordinates_dir, + dir=tiles_dir, delete=False, ) as handle: temp_npz_path = Path(handle.name) @@ -449,7 +452,7 @@ def save_tiling_result( with tempfile.NamedTemporaryFile( mode="w", suffix=".json", - dir=coordinates_dir, + dir=tiles_dir, delete=False, ) as handle: temp_meta_path = Path(handle.name) @@ -471,33 +474,158 @@ def save_tiling_result( temp_meta_path.unlink(missing_ok=True) return TilingArtifacts( sample_id=result.sample_id, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, num_tiles=result.num_tiles, ) +def extract_tiles_to_tar( + result: TilingResult, + output_dir: Path, + *, + jpeg_quality: int = 90, + tiles_dir: Path | None = None, + filter_params: FilterConfig | None = None, +) -> tuple[Path, TilingResult]: + """Extract tile images from a WSI and save them as a JPEG tar archive. + + When *filter_params* requests white/black filtering the tiles are checked + during extraction so that pixel data is read only once. The returned + ``TilingResult`` has its coordinate arrays trimmed to the surviving tiles. + """ + import wholeslidedata as wsd + from PIL import Image + + tiles_dir = Path(tiles_dir) if tiles_dir is not None else Path(output_dir) / "tiles" + tiles_dir.mkdir(parents=True, exist_ok=True) + tar_path = tiles_dir / f"{result.sample_id}.tiles.tar" + + wsi = wsd.WholeSlideImage(result.image_path, backend=result.backend) + + do_filter_white = filter_params is not None and filter_params.filter_white + do_filter_black = filter_params is not None and filter_params.filter_black + white_thresh = getattr(filter_params, "white_threshold", 220) if filter_params else 220 + black_thresh = getattr(filter_params, "black_threshold", 25) if filter_params else 25 + frac_thresh = getattr(filter_params, "fraction_threshold", 0.9) if filter_params else 0.9 + + kept_indices: list[int] = [] + tile_counter = 0 + + temp_tar_path: Path | None = None + try: + with tempfile.NamedTemporaryFile( + suffix=".tar", dir=tiles_dir, delete=False + ) as tmp: + temp_tar_path = Path(tmp.name) + + with tarfile.open(temp_tar_path, "w") as tf: + for i in range(result.num_tiles): + tile_arr = wsi.get_patch( + int(result.x[i]), + int(result.y[i]), + int(result.read_tile_size_px), + int(result.read_tile_size_px), + spacing=float(result.read_spacing_um), + center=False, + ) + if tile_arr.shape[2] > 3: + tile_arr = tile_arr[:, :, :3] + + if do_filter_white or do_filter_black: + total_pixels = tile_arr.shape[0] * tile_arr.shape[1] + if do_filter_white: + white_frac = np.all(tile_arr > white_thresh, axis=-1).sum() / total_pixels + if white_frac > frac_thresh: + continue + if do_filter_black: + black_frac = np.all(tile_arr < black_thresh, axis=-1).sum() / total_pixels + if black_frac > frac_thresh: + continue + + img = Image.fromarray(tile_arr).convert("RGB") + if result.read_tile_size_px != result.target_tile_size_px: + img = img.resize( + (result.target_tile_size_px, result.target_tile_size_px), + Image.LANCZOS, + ) + + buf = io.BytesIO() + img.save(buf, format="JPEG", quality=jpeg_quality) + buf.seek(0) + + info = tarfile.TarInfo(name=f"{tile_counter:06d}.jpg") + info.size = buf.getbuffer().nbytes + tf.addfile(info, buf) + + kept_indices.append(i) + tile_counter += 1 + + temp_tar_path.replace(tar_path) + temp_tar_path = None + finally: + if temp_tar_path is not None: + temp_tar_path.unlink(missing_ok=True) + + if len(kept_indices) == result.num_tiles: + return tar_path, result + + kept = np.asarray(kept_indices, dtype=np.int64) + filtered_result = TilingResult( + sample_id=result.sample_id, + image_path=result.image_path, + mask_path=result.mask_path, + backend=result.backend, + x=result.x[kept], + y=result.y[kept], + tile_index=np.arange(len(kept), dtype=np.int32), + target_spacing_um=result.target_spacing_um, + target_tile_size_px=result.target_tile_size_px, + read_level=result.read_level, + read_spacing_um=result.read_spacing_um, + read_tile_size_px=result.read_tile_size_px, + tile_size_lv0=result.tile_size_lv0, + overlap=result.overlap, + tissue_threshold=result.tissue_threshold, + num_tiles=len(kept), + config_hash=result.config_hash, + tissue_fraction=( + result.tissue_fraction[kept] + if result.tissue_fraction is not None + else None + ), + annotation=result.annotation, + selection_strategy=result.selection_strategy, + output_mode=result.output_mode, + ) + return tar_path, filtered_result + + +def _needs_pixel_filtering(filtering: FilterConfig) -> bool: + return bool(filtering.filter_white or filtering.filter_black) + + def load_tiling_result( - tiles_npz_path: Path, - tiles_meta_path: Path, + coordinates_npz_path: Path, + coordinates_meta_path: Path, ) -> TilingResult: try: - tiles = np.load(tiles_npz_path, allow_pickle=False) + tiles = np.load(coordinates_npz_path, allow_pickle=False) except Exception as exc: raise ValueError( - f"Unable to load tiling npz artifact {tiles_npz_path}: {exc}" + f"Unable to load tiling npz artifact {coordinates_npz_path}: {exc}" ) from exc try: - meta = json.loads(Path(tiles_meta_path).read_text()) + meta = json.loads(Path(coordinates_meta_path).read_text()) except Exception as exc: raise ValueError( - f"Unable to load tiling metadata artifact {tiles_meta_path}: {exc}" + f"Unable to load tiling metadata artifact {coordinates_meta_path}: {exc}" ) from exc required_npz_keys = {"tile_index", "x", "y"} missing_npz_keys = sorted(required_npz_keys - set(tiles.files)) if missing_npz_keys: raise ValueError( - f"Invalid tiling npz artifact {tiles_npz_path}; missing keys: " + f"Invalid tiling npz artifact {coordinates_npz_path}; missing keys: " + ", ".join(missing_npz_keys) ) required_meta_keys = { @@ -519,7 +647,7 @@ def load_tiling_result( missing_meta_keys = sorted(required_meta_keys - set(meta)) if missing_meta_keys: raise ValueError( - f"Invalid tiling metadata artifact {tiles_meta_path}; missing keys: " + f"Invalid tiling metadata artifact {coordinates_meta_path}; missing keys: " + ", ".join(missing_meta_keys) ) x = tiles["x"].astype(np.int64, copy=False) @@ -566,12 +694,12 @@ def load_tiling_result( def validate_tiling_artifacts( *, whole_slide: SlideSpec, - tiles_npz_path: Path, - tiles_meta_path: Path, + coordinates_npz_path: Path, + coordinates_meta_path: Path, expected_config_hash: str, ) -> TilingArtifacts: result = load_tiling_result( - tiles_npz_path=tiles_npz_path, tiles_meta_path=tiles_meta_path + coordinates_npz_path=coordinates_npz_path, coordinates_meta_path=coordinates_meta_path ) if result.sample_id != whole_slide.sample_id: raise ValueError( @@ -595,8 +723,8 @@ def validate_tiling_artifacts( ) return TilingArtifacts( sample_id=result.sample_id, - tiles_npz_path=tiles_npz_path, - tiles_meta_path=tiles_meta_path, + coordinates_npz_path=coordinates_npz_path, + coordinates_meta_path=coordinates_meta_path, num_tiles=result.num_tiles, ) @@ -618,21 +746,21 @@ def _validate_whole_slides(whole_slides: Sequence[SlideSpec]) -> None: def _maybe_load_existing_artifacts( *, whole_slide: SlideSpec, - read_tiles_from: Path, + read_coordinates_from: Path, expected_config_hash: str, ) -> TilingArtifacts | None: - npz_path = read_tiles_from / f"{whole_slide.sample_id}.tiles.npz" - meta_path = read_tiles_from / f"{whole_slide.sample_id}.tiles.meta.json" + npz_path = read_coordinates_from / f"{whole_slide.sample_id}.coordinates.npz" + meta_path = read_coordinates_from / f"{whole_slide.sample_id}.coordinates.meta.json" if not npz_path.is_file() and not meta_path.is_file(): return None if not npz_path.is_file() or not meta_path.is_file(): raise ValueError( - f"Missing tiling sidecar for sample_id={whole_slide.sample_id} in {read_tiles_from}" + f"Missing tiling sidecar for sample_id={whole_slide.sample_id} in {read_coordinates_from}" ) return validate_tiling_artifacts( whole_slide=whole_slide, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, expected_config_hash=expected_config_hash, ) @@ -751,6 +879,7 @@ class _SlideComputeRequest: output_dir: Path num_workers: int include_result: bool = False + save_tiles: bool = False @dataclass(frozen=True) @@ -773,9 +902,10 @@ def _build_success_artifact( ) -> TilingArtifacts: return TilingArtifacts( sample_id=base_artifact.sample_id, - tiles_npz_path=base_artifact.tiles_npz_path, - tiles_meta_path=base_artifact.tiles_meta_path, + coordinates_npz_path=base_artifact.coordinates_npz_path, + coordinates_meta_path=base_artifact.coordinates_meta_path, num_tiles=base_artifact.num_tiles, + tiles_tar_path=base_artifact.tiles_tar_path, mask_preview_path=mask_preview_path, tiling_preview_path=tiling_preview_path, ) @@ -794,8 +924,9 @@ def _build_success_process_row( ), "tiling_status": "success", "num_tiles": artifact.num_tiles, - "tiles_npz_path": str(artifact.tiles_npz_path), - "tiles_meta_path": str(artifact.tiles_meta_path), + "coordinates_npz_path": str(artifact.coordinates_npz_path), + "coordinates_meta_path": str(artifact.coordinates_meta_path), + "tiles_tar_path": str(artifact.tiles_tar_path) if artifact.tiles_tar_path is not None else np.nan, "error": np.nan, "traceback": np.nan, } @@ -815,8 +946,9 @@ def _build_failure_process_row( ), "tiling_status": "failed", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, + "tiles_tar_path": np.nan, "error": error, "traceback": traceback_text, } @@ -847,16 +979,36 @@ def _compute_tiling_result_from_request( request: _SlideComputeRequest, ) -> _SlideComputeResponse: try: + defer_pixel_filtering = request.save_tiles and _needs_pixel_filtering(request.filtering) + effective_filtering = ( + replace(request.filtering, filter_white=False, filter_black=False) + if defer_pixel_filtering + else request.filtering + ) result = _compute_tiling_result( request.whole_slide, tiling=request.tiling, segmentation=request.segmentation, - filtering=request.filtering, + filtering=effective_filtering, mask_preview_path=request.mask_preview_path, num_workers=request.num_workers, config_hash=request.config_hash, ) + tiles_tar_path: Path | None = None + if request.save_tiles: + tiles_tar_path, result = extract_tiles_to_tar( + result, + output_dir=request.output_dir, + filter_params=request.filtering if _needs_pixel_filtering(request.filtering) else None, + ) artifact = save_tiling_result(result, output_dir=request.output_dir) + artifact = TilingArtifacts( + sample_id=artifact.sample_id, + coordinates_npz_path=artifact.coordinates_npz_path, + coordinates_meta_path=artifact.coordinates_meta_path, + num_tiles=artifact.num_tiles, + tiles_tar_path=tiles_tar_path, + ) mask_preview_path = ( request.mask_preview_path if request.mask_preview_path is not None @@ -903,7 +1055,7 @@ def _write_tiling_preview_from_artifacts( output_dir: Path, downsample: int, ) -> Path | None: - result = load_tiling_result(artifact.tiles_npz_path, artifact.tiles_meta_path) + result = load_tiling_result(artifact.coordinates_npz_path, artifact.coordinates_meta_path) return write_tiling_preview( result=result, output_dir=output_dir, @@ -921,7 +1073,8 @@ def tile_slides( output_dir: Path, num_workers: int = 1, resume: bool = False, - read_tiles_from: Path | None = None, + read_coordinates_from: Path | None = None, + save_tiles: bool = False, ) -> list[TilingArtifacts]: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -940,8 +1093,8 @@ def tile_slides( "mask_path", "tiling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", "error", "traceback", }, @@ -983,18 +1136,18 @@ def _expected_hash_for_slide(whole_slide: SlideSpec) -> str: artifact: TilingArtifacts | None = None if whole_slide.sample_id in existing_successes: row = existing_successes[whole_slide.sample_id] - npz_path = Path(str(row["tiles_npz_path"])) - meta_path = Path(str(row["tiles_meta_path"])) + npz_path = Path(str(row["coordinates_npz_path"])) + meta_path = Path(str(row["coordinates_meta_path"])) artifact = validate_tiling_artifacts( whole_slide=whole_slide, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, expected_config_hash=expected_hash, ) - if read_tiles_from is not None and artifact is None: + if read_coordinates_from is not None and artifact is None: artifact = _maybe_load_existing_artifacts( whole_slide=whole_slide, - read_tiles_from=Path(read_tiles_from), + read_coordinates_from=Path(read_coordinates_from), expected_config_hash=expected_hash, ) if artifact is not None: @@ -1019,6 +1172,7 @@ def _expected_hash_for_slide(whole_slide: SlideSpec) -> str: mask_preview_path=mask_preview_path, output_dir=output_dir, num_workers=1, + save_tiles=save_tiles, ) planned_work.append( _PlannedSlideWork( @@ -1222,6 +1376,7 @@ def _await_response(input_index: int) -> _SlideComputeResponse: output_dir=request.output_dir, num_workers=worker_inner_workers, include_result=False, + save_tiles=request.save_tiles, ) for request in compute_requests ] @@ -1247,6 +1402,7 @@ def _await_response(input_index: int) -> _SlideComputeResponse: output_dir=request.output_dir, num_workers=worker_inner_workers, include_result=True, + save_tiles=request.save_tiles, ) for request in compute_requests ] diff --git a/hs2p/configs/default.yaml b/hs2p/configs/default.yaml index 4542342..fd9c620 100644 --- a/hs2p/configs/default.yaml +++ b/hs2p/configs/default.yaml @@ -9,7 +9,7 @@ save_previews: true # save preview images of slide tiling and mask overlays seed: 0 # seed for reproducibility tiling: - read_tiles_from: # path to a directory containing {sample_id}.tiles.npz and {sample_id}.tiles.meta.json (leave empty to compute tiles) + read_coordinates_from: # path to a directory containing {sample_id}.coordinates.npz and {sample_id}.coordinates.meta.json (leave empty to compute tiles) backend: "asap" # backend to use for slide reading params: target_spacing_um: 0.5 # spacing at which to tile the slide, in microns per pixel diff --git a/hs2p/configs/resolvers.py b/hs2p/configs/resolvers.py index 99ee5af..37a68a5 100644 --- a/hs2p/configs/resolvers.py +++ b/hs2p/configs/resolvers.py @@ -37,8 +37,8 @@ def resolve_preview_config(cfg: Any) -> PreviewConfig: ) -def resolve_read_tiles_from(cfg: Any) -> Path | None: - value = cfg.tiling.read_tiles_from +def resolve_read_coordinates_from(cfg: Any) -> Path | None: + value = cfg.tiling.read_coordinates_from if not value: return None return Path(value) diff --git a/hs2p/sampling.py b/hs2p/sampling.py index 47f2e21..a16ccb7 100644 --- a/hs2p/sampling.py +++ b/hs2p/sampling.py @@ -182,10 +182,10 @@ def _save_sampling_coordinates( selection_strategy=selection_strategy, output_mode=output_mode, ) - annotation_dir = Path(cfg.output_dir, "coordinates", annotation) + annotation_dir = Path(cfg.output_dir, "tiles", annotation) annotation_dir.mkdir(parents=True, exist_ok=True) return save_tiling_result( - result, output_dir=cfg.output_dir, coordinates_dir=annotation_dir + result, output_dir=cfg.output_dir, tiles_dir=annotation_dir ) @@ -307,11 +307,11 @@ def process_slide( "mask_path": str(mask_path) if mask_path is not None else None, "sampling_status": "success", "num_tiles": len(coordinates), - "tiles_npz_path": ( - str(artifacts.tiles_npz_path) if artifacts is not None else np.nan + "coordinates_npz_path": ( + str(artifacts.coordinates_npz_path) if artifacts is not None else np.nan ), - "tiles_meta_path": ( - str(artifacts.tiles_meta_path) + "coordinates_meta_path": ( + str(artifacts.coordinates_meta_path) if artifacts is not None else np.nan ), @@ -338,8 +338,8 @@ def process_slide( "mask_path": str(mask_path) if mask_path is not None else None, "sampling_status": "failed", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": np.nan, "error": str(e), "traceback": str(traceback.format_exc()), @@ -366,8 +366,8 @@ def _build_sampling_process_rows( ), "sampling_status": "tbp", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": np.nan, "error": np.nan, "traceback": np.nan, @@ -390,8 +390,8 @@ def _validate_sampling_artifact_row( num_tiles = int(row.get("num_tiles", 0)) if num_tiles == 0: return - npz_path = Path(str(row["tiles_npz_path"])) - meta_path = Path(str(row["tiles_meta_path"])) + npz_path = Path(str(row["coordinates_npz_path"])) + meta_path = Path(str(row["coordinates_meta_path"])) result = load_tiling_result(npz_path, meta_path) if result.sample_id != whole_slide.sample_id: raise ValueError("sampling sample_id mismatch") @@ -435,7 +435,7 @@ def main(args): output_dir=str(output_dir), num_workers=int(cfg.speed.num_workers), resume=bool(cfg.resume), - read_tiles_from=None, + read_coordinates_from=None, ) process_list = output_dir / "process_list.csv" @@ -450,8 +450,8 @@ def main(args): "mask_path", "sampling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", "config_hash", "error", "traceback", @@ -475,8 +475,8 @@ def main(args): "mask_path", "sampling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", "config_hash", "error", "traceback", @@ -523,8 +523,8 @@ def main(args): cfg, slide_count=total ) - coordinates_dir = output_dir / "coordinates" - coordinates_dir.mkdir(exist_ok=True, parents=True) + tiles_dir = output_dir / "tiles" + tiles_dir.mkdir(exist_ok=True, parents=True) mask_preview_dir = None sampling_preview_dir = None if cfg.save_previews: diff --git a/hs2p/tiling.py b/hs2p/tiling.py index 7a77502..4774097 100644 --- a/hs2p/tiling.py +++ b/hs2p/tiling.py @@ -7,7 +7,7 @@ from hs2p.configs.resolvers import ( resolve_filter_config, resolve_preview_config, - resolve_read_tiles_from, + resolve_read_coordinates_from, resolve_segmentation_config, resolve_tiling_config, ) @@ -53,7 +53,7 @@ def main(args): segmentation = resolve_segmentation_config(cfg) filtering = resolve_filter_config(cfg) preview = resolve_preview_config(cfg) - read_tiles_from = resolve_read_tiles_from(cfg) + read_coordinates_from = resolve_read_coordinates_from(cfg) progress.emit_progress( "run.started", command="tiling", @@ -64,7 +64,9 @@ def main(args): output_dir=str(output_dir), num_workers=int(cfg.speed.num_workers), resume=bool(cfg.resume), - read_tiles_from=str(read_tiles_from) if read_tiles_from else None, + read_coordinates_from=( + str(read_coordinates_from) if read_coordinates_from else None + ), ) artifacts = tile_slides( whole_slides, @@ -75,7 +77,7 @@ def main(args): output_dir=output_dir, num_workers=cfg.speed.num_workers, resume=cfg.resume, - read_tiles_from=read_tiles_from, + read_coordinates_from=read_coordinates_from, ) pd.read_csv(output_dir / "process_list.csv") progress.emit_progress( diff --git a/scripts/benchmark_throughput.py b/scripts/benchmark_throughput.py index 481b31f..f825e14 100644 --- a/scripts/benchmark_throughput.py +++ b/scripts/benchmark_throughput.py @@ -359,7 +359,7 @@ def write_config( "save_previews": False, "seed": 0, "tiling": { - "read_tiles_from": None, + "read_coordinates_from": None, "backend": backend, "params": { "target_spacing_um": target_spacing, diff --git a/tests/fixtures/gt/test-wsi.tiles.meta.json b/tests/fixtures/gt/test-wsi.coordinates.meta.json similarity index 100% rename from tests/fixtures/gt/test-wsi.tiles.meta.json rename to tests/fixtures/gt/test-wsi.coordinates.meta.json diff --git a/tests/fixtures/gt/test-wsi.tiles.npz b/tests/fixtures/gt/test-wsi.coordinates.npz similarity index 100% rename from tests/fixtures/gt/test-wsi.tiles.npz rename to tests/fixtures/gt/test-wsi.coordinates.npz diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py index 20b5adb..f947528 100644 --- a/tests/test_cli_smoke.py +++ b/tests/test_cli_smoke.py @@ -40,7 +40,7 @@ def _base_cfg(tmp_path: Path, csv_path: Path) -> SimpleNamespace: save_previews=False, speed=SimpleNamespace(num_workers=1), tiling=SimpleNamespace( - read_tiles_from=None, + read_coordinates_from=None, backend="asap", params=SimpleNamespace( target_spacing_um=0.5, @@ -101,9 +101,9 @@ def _fake_tile_slides( output_dir, num_workers, resume, - read_tiles_from, + read_coordinates_from, ): - del tiling, segmentation, filtering, preview, num_workers, resume, read_tiles_from + del tiling, segmentation, filtering, preview, num_workers, resume, read_coordinates_from captured["whole_slides"] = whole_slides output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -115,12 +115,13 @@ def _fake_tile_slides( "mask_path": "slide-1-mask.png", "tiling_status": "success", "num_tiles": 2, - "tiles_npz_path": str( - output_dir / "coordinates" / "slide-1.tiles.npz" + "coordinates_npz_path": str( + output_dir / "tiles" / "slide-1.coordinates.npz" ), - "tiles_meta_path": str( - output_dir / "coordinates" / "slide-1.tiles.meta.json" + "coordinates_meta_path": str( + output_dir / "tiles" / "slide-1.coordinates.meta.json" ), + "tiles_tar_path": np.nan, "error": np.nan, "traceback": np.nan, } @@ -130,8 +131,8 @@ def _fake_tile_slides( return [ TilingArtifacts( sample_id="slide-1", - tiles_npz_path=output_dir / "coordinates" / "slide-1.tiles.npz", - tiles_meta_path=output_dir / "coordinates" / "slide-1.tiles.meta.json", + coordinates_npz_path=output_dir / "tiles" / "slide-1.coordinates.npz", + coordinates_meta_path=output_dir / "tiles" / "slide-1.coordinates.meta.json", num_tiles=2, ) ] @@ -154,8 +155,9 @@ def _fake_tile_slides( "mask_path", "tiling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", + "tiles_tar_path", "error", "traceback", ] @@ -195,15 +197,15 @@ def imap(self, fn, args_list): monkeypatch.setattr(sampling_mod.mp, "Pool", _FakePool) def _fake_process_slide_wrapper(kwargs): - annotation_dir = Path(kwargs["cfg"].output_dir) / "coordinates" / "tumor" + annotation_dir = Path(kwargs["cfg"].output_dir) / "tiles" / "tumor" annotation_dir.mkdir(parents=True, exist_ok=True) np.savez( - annotation_dir / f"{kwargs['sample_id']}.tiles.npz", + annotation_dir / f"{kwargs['sample_id']}.coordinates.npz", tile_index=np.array([0], dtype=np.int32), x=np.array([10], dtype=np.int64), y=np.array([20], dtype=np.int64), ) - meta_path = annotation_dir / f"{kwargs['sample_id']}.tiles.meta.json" + meta_path = annotation_dir / f"{kwargs['sample_id']}.coordinates.meta.json" meta_path.write_text( '{"sample_id":"slide-1","image_path":"slide-1.svs","mask_path":"slide-1-mask.png","backend":"asap","target_spacing_um":0.5,"target_tile_size_px":256,"read_level":0,"read_spacing_um":0.5,"read_tile_size_px":256,"tile_size_lv0":256,"overlap":0.0,"tissue_threshold":0.1,"num_tiles":1,"config_hash":"hash","annotation":"tumor","selection_strategy":"independent_sampling","output_mode":"per_annotation"}\n' ) @@ -217,8 +219,8 @@ def _fake_process_slide_wrapper(kwargs): "mask_path": "slide-1-mask.png", "sampling_status": "success", "num_tiles": 1, - "tiles_npz_path": str(annotation_dir / f"{kwargs['sample_id']}.tiles.npz"), - "tiles_meta_path": str(meta_path), + "coordinates_npz_path": str(annotation_dir / f"{kwargs['sample_id']}.coordinates.npz"), + "coordinates_meta_path": str(meta_path), "config_hash": "hash", "error": np.nan, "traceback": np.nan, @@ -240,8 +242,8 @@ def _fake_process_slide_wrapper(kwargs): "mask_path", "sampling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", "config_hash", "error", "traceback", @@ -253,11 +255,11 @@ def _fake_process_slide_wrapper(kwargs): assert row["mask_path"] == "slide-1-mask.png" assert row["sampling_status"] == "success" assert row["num_tiles"] == 1 - assert row["tiles_npz_path"].endswith("coordinates/tumor/slide-1.tiles.npz") - assert row["tiles_meta_path"].endswith("coordinates/tumor/slide-1.tiles.meta.json") + assert row["coordinates_npz_path"].endswith("tiles/tumor/slide-1.coordinates.npz") + assert row["coordinates_meta_path"].endswith("tiles/tumor/slide-1.coordinates.meta.json") assert row["config_hash"] == "hash" assert pd.isna(row["error"]) assert pd.isna(row["traceback"]) assert ( - Path(cfg.output_dir) / "coordinates" / "tumor" / "slide-1.tiles.npz" + Path(cfg.output_dir) / "tiles" / "tumor" / "slide-1.coordinates.npz" ).is_file() diff --git a/tests/test_fixture_artifacts_regression.py b/tests/test_fixture_artifacts_regression.py index f2cb7f6..2c1f662 100644 --- a/tests/test_fixture_artifacts_regression.py +++ b/tests/test_fixture_artifacts_regression.py @@ -83,16 +83,16 @@ def _run_and_save_tiles( num_workers=1, ) artifacts = save_tiling_result(result, output_dir=output_dir) - generated = np.load(artifacts.tiles_npz_path, allow_pickle=False) - meta = json.loads(artifacts.tiles_meta_path.read_text()) + generated = np.load(artifacts.coordinates_npz_path, allow_pickle=False) + meta = json.loads(artifacts.coordinates_meta_path.read_text()) return result, artifacts, generated, meta def test_generated_tiles_match_checked_in_artifacts(real_fixture_paths, tmp_path: Path): wsi_path, mask_path = real_fixture_paths gt_dir = wsi_path.parent.parent / "gt" - gt_npz_path = gt_dir / "test-wsi.tiles.npz" - gt_meta_path = gt_dir / "test-wsi.tiles.meta.json" + gt_npz_path = gt_dir / "test-wsi.coordinates.npz" + gt_meta_path = gt_dir / "test-wsi.coordinates.meta.json" assert gt_npz_path.is_file(), f"Missing golden coordinates: {gt_npz_path}" assert gt_meta_path.is_file(), f"Missing golden metadata: {gt_meta_path}" diff --git a/tests/test_progress.py b/tests/test_progress.py index d474a05..f780c6e 100644 --- a/tests/test_progress.py +++ b/tests/test_progress.py @@ -105,7 +105,7 @@ def _base_cli_cfg(tmp_path: Path, *, resume: bool = False) -> SimpleNamespace: speed=SimpleNamespace(num_workers=1), tiling=SimpleNamespace( backend="asap", - read_tiles_from=None, + read_coordinates_from=None, params=SimpleNamespace( target_spacing_um=0.5, target_tile_size_px=256, @@ -163,7 +163,7 @@ def test_tiling_main_installs_progress_reporter_only_during_pipeline_run( ) monkeypatch.setattr(tiling_mod, "resolve_filter_config", lambda cfg: _filter_config()) monkeypatch.setattr(tiling_mod, "resolve_preview_config", lambda cfg: None) - monkeypatch.setattr(tiling_mod, "resolve_read_tiles_from", lambda cfg: None) + monkeypatch.setattr(tiling_mod, "resolve_read_coordinates_from", lambda cfg: None) monkeypatch.setattr(progress, "create_cli_progress_reporter", lambda **kwargs: reporter) def _fake_tile_slides(*args, **kwargs): @@ -178,9 +178,9 @@ def _fake_tile_slides(*args, **kwargs): "mask_path": np.nan, "tiling_status": "success", "num_tiles": 2, - "tiles_npz_path": str(output_dir / "coordinates" / "slide-1.tiles.npz"), - "tiles_meta_path": str( - output_dir / "coordinates" / "slide-1.tiles.meta.json" + "coordinates_npz_path": str(output_dir / "tiles" / "slide-1.coordinates.npz"), + "coordinates_meta_path": str( + output_dir / "tiles" / "slide-1.coordinates.meta.json" ), "error": np.nan, "traceback": np.nan, @@ -190,8 +190,8 @@ def _fake_tile_slides(*args, **kwargs): return [ TilingArtifacts( sample_id="slide-1", - tiles_npz_path=output_dir / "coordinates" / "slide-1.tiles.npz", - tiles_meta_path=output_dir / "coordinates" / "slide-1.tiles.meta.json", + coordinates_npz_path=output_dir / "tiles" / "slide-1.coordinates.npz", + coordinates_meta_path=output_dir / "tiles" / "slide-1.coordinates.meta.json", num_tiles=2, ) ] @@ -225,9 +225,9 @@ def test_tile_slides_emits_progress_for_reused_success_and_failure( "mask_path": np.nan, "tiling_status": "success", "num_tiles": 2, - "tiles_npz_path": str(run_dir / "coordinates" / "slide-a.tiles.npz"), - "tiles_meta_path": str( - run_dir / "coordinates" / "slide-a.tiles.meta.json" + "coordinates_npz_path": str(run_dir / "tiles" / "slide-a.coordinates.npz"), + "coordinates_meta_path": str( + run_dir / "tiles" / "slide-a.coordinates.meta.json" ), "error": np.nan, "traceback": np.nan, @@ -239,10 +239,10 @@ def _fake_validate_tiling_artifacts(**kwargs): whole_slide = kwargs["whole_slide"] return TilingArtifacts( sample_id=whole_slide.sample_id, - tiles_npz_path=run_dir / "coordinates" / f"{whole_slide.sample_id}.tiles.npz", - tiles_meta_path=run_dir - / "coordinates" - / f"{whole_slide.sample_id}.tiles.meta.json", + coordinates_npz_path=run_dir / "tiles" / f"{whole_slide.sample_id}.coordinates.npz", + coordinates_meta_path=run_dir + / "tiles" + / f"{whole_slide.sample_id}.coordinates.meta.json", num_tiles=2, ) @@ -254,8 +254,8 @@ def _fake_compute_and_save(request): ok=True, artifact=TilingArtifacts( sample_id="slide-b", - tiles_npz_path=run_dir / "coordinates" / "slide-b.tiles.npz", - tiles_meta_path=run_dir / "coordinates" / "slide-b.tiles.meta.json", + coordinates_npz_path=run_dir / "tiles" / "slide-b.coordinates.npz", + coordinates_meta_path=run_dir / "tiles" / "slide-b.coordinates.meta.json", num_tiles=1, ), mask_preview_path=None, @@ -396,17 +396,17 @@ def _fake_process_slide_wrapper(kwargs): "mask_path": None, "sampling_status": "success", "num_tiles": 3, - "tiles_npz_path": str( + "coordinates_npz_path": str( Path(kwargs["cfg"].output_dir) - / "coordinates" + / "tiles" / "tumor" - / f"{sample_id}.tiles.npz" + / f"{sample_id}.coordinates.npz" ), - "tiles_meta_path": str( + "coordinates_meta_path": str( Path(kwargs["cfg"].output_dir) - / "coordinates" + / "tiles" / "tumor" - / f"{sample_id}.tiles.meta.json" + / f"{sample_id}.coordinates.meta.json" ), "config_hash": "hash", "error": np.nan, @@ -422,8 +422,8 @@ def _fake_process_slide_wrapper(kwargs): "mask_path": None, "sampling_status": "failed", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": "hash", "error": "boom", "traceback": "traceback", @@ -496,8 +496,8 @@ def test_sampling_main_emits_finished_summary_when_resume_has_no_work( "mask_path": np.nan, "sampling_status": "success", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": "expected-hash", "error": np.nan, "traceback": np.nan, diff --git a/tests/test_sampling_process_slide.py b/tests/test_sampling_process_slide.py index 3c1c456..c7af272 100644 --- a/tests/test_sampling_process_slide.py +++ b/tests/test_sampling_process_slide.py @@ -596,8 +596,8 @@ def imap(self, fn, args_list): ), "sampling_status": "success", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": "hash", "error": np.nan, "traceback": np.nan, @@ -718,8 +718,8 @@ def imap(self, fn, args_list): ), "sampling_status": "success", "num_tiles": 0, - "tiles_npz_path": np.nan, - "tiles_meta_path": np.nan, + "coordinates_npz_path": np.nan, + "coordinates_meta_path": np.nan, "config_hash": "hash", "error": np.nan, "traceback": np.nan, @@ -745,15 +745,15 @@ def test_save_sampling_coordinates_uses_annotation_threshold_and_sampling_mode_i ), ) - def _fake_save_tiling_result(result, output_dir, coordinates_dir=None): + def _fake_save_tiling_result(result, output_dir, tiles_dir=None): captured["result"] = result captured["output_dir"] = output_dir - captured["coordinates_dir"] = coordinates_dir + captured["tiles_dir"] = tiles_dir return SimpleNamespace( sample_id=result.sample_id, - tiles_npz_path=Path(coordinates_dir) / f"{result.sample_id}.tiles.npz", - tiles_meta_path=Path(coordinates_dir) - / f"{result.sample_id}.tiles.meta.json", + coordinates_npz_path=Path(tiles_dir) / f"{result.sample_id}.coordinates.npz", + coordinates_meta_path=Path(tiles_dir) + / f"{result.sample_id}.coordinates.meta.json", num_tiles=result.num_tiles, ) @@ -826,7 +826,7 @@ def _fake_save_tiling_result(result, output_dir, coordinates_dir=None): output_mode=sampling_mod.CoordinateOutputMode.PER_ANNOTATION, annotation="tumor", ) - assert captured["coordinates_dir"] == tmp_path / "coordinates" / "tumor" + assert captured["tiles_dir"] == tmp_path / "tiles" / "tumor" def test_save_sampling_coordinates_writes_sampling_metadata_fields(tmp_path): @@ -892,7 +892,7 @@ def test_save_sampling_coordinates_writes_sampling_metadata_fields(tmp_path): resolved_sampling_spec=resolved_sampling_spec, ) - meta = json.loads(artifacts.tiles_meta_path.read_text()) + meta = json.loads(artifacts.coordinates_meta_path.read_text()) assert meta["annotation"] == "tumor" assert meta["selection_strategy"] == "joint_sampling" assert meta["output_mode"] == "per_annotation" diff --git a/tests/test_tile_extraction.py b/tests/test_tile_extraction.py new file mode 100644 index 0000000..0b2182c --- /dev/null +++ b/tests/test_tile_extraction.py @@ -0,0 +1,253 @@ +"""Tests for extract_tiles_to_tar() and the save_tiles pipeline option.""" + +import io +import tarfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import numpy as np +from PIL import Image + +from hs2p.api import TilingResult, extract_tiles_to_tar +from hs2p.configs.models import FilterConfig + + +def _make_tiling_result( + num_tiles: int = 3, + tile_size: int = 256, + sample_id: str = "slide-1", +) -> TilingResult: + return TilingResult( + sample_id=sample_id, + image_path=Path("/data/slide-1.svs"), + mask_path=None, + backend="openslide", + x=np.arange(num_tiles, dtype=np.int64) * tile_size, + y=np.zeros(num_tiles, dtype=np.int64), + tile_index=np.arange(num_tiles, dtype=np.int32), + target_spacing_um=0.5, + target_tile_size_px=tile_size, + read_level=0, + read_spacing_um=0.5, + read_tile_size_px=tile_size, + tile_size_lv0=tile_size, + overlap=0.0, + tissue_threshold=0.1, + num_tiles=num_tiles, + config_hash="abc123", + ) + + +def _solid_patch(color: tuple[int, int, int], size: int = 256) -> np.ndarray: + """Return an (size, size, 3) uint8 array filled with *color*.""" + arr = np.empty((size, size, 3), dtype=np.uint8) + arr[:] = color + return arr + + +class TestExtractTilesToTar: + """Unit tests for extract_tiles_to_tar().""" + + def test_creates_tar_with_correct_number_of_jpegs(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=3) + colors = [(128, 0, 0), (0, 128, 0), (0, 0, 128)] + + mock_wsi = MagicMock() + mock_wsi.get_patch.side_effect = [_solid_patch(c) for c in colors] + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, out_result = extract_tiles_to_tar(result, output_dir=tmp_path) + + assert tar_path == tmp_path / "tiles" / "slide-1.tiles.tar" + assert tar_path.is_file() + + with tarfile.open(tar_path, "r") as tf: + members = tf.getmembers() + assert [m.name for m in members] == [ + "000000.jpg", + "000001.jpg", + "000002.jpg", + ] + for m in members: + data = tf.extractfile(m).read() + img = Image.open(io.BytesIO(data)) + assert img.size == (256, 256) + + # No filtering — result unchanged + assert out_result is result + + def test_white_filtering_drops_white_tile(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=3) + patches = [ + _solid_patch((128, 0, 0)), # keep + _solid_patch((250, 250, 250)), # all-white → drop + _solid_patch((0, 0, 128)), # keep + ] + + mock_wsi = MagicMock() + mock_wsi.get_patch.side_effect = patches + + filter_cfg = FilterConfig( + filter_white=True, + white_threshold=220, + fraction_threshold=0.9, + ) + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, filtered = extract_tiles_to_tar( + result, output_dir=tmp_path, filter_params=filter_cfg + ) + + # Tar should have 2 tiles + with tarfile.open(tar_path, "r") as tf: + assert len(tf.getmembers()) == 2 + + # Result should reflect filtering + assert filtered.num_tiles == 2 + np.testing.assert_array_equal(filtered.x, [0, 512]) + np.testing.assert_array_equal(filtered.y, [0, 0]) + assert len(filtered.tile_index) == 2 + np.testing.assert_array_equal(filtered.tile_index, [0, 1]) + + def test_black_filtering_drops_black_tile(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=2) + patches = [ + _solid_patch((5, 5, 5)), # all-black → drop + _solid_patch((128, 128, 0)), # keep + ] + + mock_wsi = MagicMock() + mock_wsi.get_patch.side_effect = patches + + filter_cfg = FilterConfig( + filter_black=True, + black_threshold=25, + fraction_threshold=0.9, + ) + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, filtered = extract_tiles_to_tar( + result, output_dir=tmp_path, filter_params=filter_cfg + ) + + with tarfile.open(tar_path, "r") as tf: + assert len(tf.getmembers()) == 1 + + assert filtered.num_tiles == 1 + np.testing.assert_array_equal(filtered.x, [256]) + + def test_all_tiles_filtered_produces_empty_tar(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=1) + mock_wsi = MagicMock() + mock_wsi.get_patch.return_value = _solid_patch((250, 250, 250)) + + filter_cfg = FilterConfig( + filter_white=True, + white_threshold=220, + fraction_threshold=0.9, + ) + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, filtered = extract_tiles_to_tar( + result, output_dir=tmp_path, filter_params=filter_cfg + ) + + with tarfile.open(tar_path, "r") as tf: + assert len(tf.getmembers()) == 0 + + assert filtered.num_tiles == 0 + assert len(filtered.x) == 0 + + def test_resizes_when_read_and_target_sizes_differ(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=1, tile_size=224) + # Override read_tile_size_px to be different from target + result = TilingResult( + **{ + **{f.name: getattr(result, f.name) for f in result.__dataclass_fields__.values()}, + "read_tile_size_px": 512, + "target_tile_size_px": 224, + } + ) + + mock_wsi = MagicMock() + mock_wsi.get_patch.return_value = _solid_patch((100, 100, 100), size=512) + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, _ = extract_tiles_to_tar(result, output_dir=tmp_path) + + with tarfile.open(tar_path, "r") as tf: + data = tf.extractfile(tf.getmembers()[0]).read() + img = Image.open(io.BytesIO(data)) + assert img.size == (224, 224) + + def test_strips_alpha_channel(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=1) + rgba = np.zeros((256, 256, 4), dtype=np.uint8) + rgba[:, :, :3] = 100 + rgba[:, :, 3] = 255 + + mock_wsi = MagicMock() + mock_wsi.get_patch.return_value = rgba + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, _ = extract_tiles_to_tar(result, output_dir=tmp_path) + + with tarfile.open(tar_path, "r") as tf: + data = tf.extractfile(tf.getmembers()[0]).read() + img = Image.open(io.BytesIO(data)) + assert img.mode == "RGB" + + def test_custom_tiles_dir(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=1) + custom_dir = tmp_path / "custom_output" + + mock_wsi = MagicMock() + mock_wsi.get_patch.return_value = _solid_patch((50, 50, 50)) + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + tar_path, _ = extract_tiles_to_tar( + result, output_dir=tmp_path, tiles_dir=custom_dir + ) + + assert tar_path == custom_dir / "slide-1.tiles.tar" + assert tar_path.is_file() + + def test_no_filter_params_keeps_all_tiles(self, tmp_path: Path): + result = _make_tiling_result(num_tiles=2) + mock_wsi = MagicMock() + # Even white tiles should be kept when no filter_params + mock_wsi.get_patch.side_effect = [ + _solid_patch((255, 255, 255)), + _solid_patch((0, 0, 0)), + ] + + with patch("wholeslidedata.WholeSlideImage", return_value=mock_wsi): + _, out_result = extract_tiles_to_tar( + result, output_dir=tmp_path, filter_params=None + ) + + assert out_result is result # unchanged + + +class TestNeedsPixelFiltering: + def test_no_filtering(self): + from hs2p.api import _needs_pixel_filtering + + assert not _needs_pixel_filtering(FilterConfig()) + + def test_white_only(self): + from hs2p.api import _needs_pixel_filtering + + assert _needs_pixel_filtering(FilterConfig(filter_white=True)) + + def test_black_only(self): + from hs2p.api import _needs_pixel_filtering + + assert _needs_pixel_filtering(FilterConfig(filter_black=True)) + + def test_both(self): + from hs2p.api import _needs_pixel_filtering + + assert _needs_pixel_filtering( + FilterConfig(filter_white=True, filter_black=True) + ) diff --git a/tests/test_tiling_api.py b/tests/test_tiling_api.py index 3d2498e..46f73dc 100644 --- a/tests/test_tiling_api.py +++ b/tests/test_tiling_api.py @@ -329,14 +329,14 @@ def test_save_tiling_result_writes_expected_npz_and_json(tmp_path: Path): assert artifacts == TilingArtifacts( sample_id="slide-2", - tiles_npz_path=tmp_path / "coordinates" / "slide-2.tiles.npz", - tiles_meta_path=tmp_path / "coordinates" / "slide-2.tiles.meta.json", + coordinates_npz_path=tmp_path / "tiles" / "slide-2.coordinates.npz", + coordinates_meta_path=tmp_path / "tiles" / "slide-2.coordinates.meta.json", num_tiles=2, mask_preview_path=None, tiling_preview_path=None, ) - tiles = np.load(artifacts.tiles_npz_path, allow_pickle=False) + tiles = np.load(artifacts.coordinates_npz_path, allow_pickle=False) assert set(tiles.files) == {"tile_index", "x", "y", "tissue_fraction"} np.testing.assert_array_equal(tiles["tile_index"], np.array([0, 1], dtype=np.int32)) np.testing.assert_array_equal(tiles["x"], np.array([10, 30], dtype=np.int64)) @@ -346,7 +346,7 @@ def test_save_tiling_result_writes_expected_npz_and_json(tmp_path: Path): np.array([0.3, 0.7], dtype=np.float32), ) - meta = json.loads(artifacts.tiles_meta_path.read_text()) + meta = json.loads(artifacts.coordinates_meta_path.read_text()) assert set(meta) == { "backend", "config_hash", @@ -404,7 +404,7 @@ def test_save_and_load_tiling_result_round_trip(tmp_path: Path): ) artifacts = save_tiling_result(result, output_dir=tmp_path) - loaded = load_tiling_result(artifacts.tiles_npz_path, artifacts.tiles_meta_path) + loaded = load_tiling_result(artifacts.coordinates_npz_path, artifacts.coordinates_meta_path) assert loaded.sample_id == result.sample_id assert loaded.image_path == result.image_path @@ -490,18 +490,18 @@ def _fake_compute_tiling_result( config_hash="hash", ) - def _fake_save_tiling_result(result, output_dir, coordinates_dir=None): - del coordinates_dir - coordinates_dir = Path(output_dir) / "coordinates" - coordinates_dir.mkdir(parents=True, exist_ok=True) - npz_path = coordinates_dir / f"{result.sample_id}.tiles.npz" - meta_path = coordinates_dir / f"{result.sample_id}.tiles.meta.json" + def _fake_save_tiling_result(result, output_dir, tiles_dir=None): + del tiles_dir + tiles_dir = Path(output_dir) / "tiles" + tiles_dir.mkdir(parents=True, exist_ok=True) + npz_path = tiles_dir / f"{result.sample_id}.coordinates.npz" + meta_path = tiles_dir / f"{result.sample_id}.coordinates.meta.json" npz_path.write_bytes(b"npz") meta_path.write_text("{}") return TilingArtifacts( sample_id=result.sample_id, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, num_tiles=result.num_tiles, ) @@ -569,10 +569,10 @@ def test_tile_slides_uses_slide_level_pool_and_preserves_input_order( def _fake_compute_and_save(request): seen["inner_workers"].append(request.num_workers) - coordinates_dir = Path(request.output_dir) / "coordinates" - coordinates_dir.mkdir(parents=True, exist_ok=True) - npz_path = coordinates_dir / f"{request.whole_slide.sample_id}.tiles.npz" - meta_path = coordinates_dir / f"{request.whole_slide.sample_id}.tiles.meta.json" + tiles_dir = Path(request.output_dir) / "tiles" + tiles_dir.mkdir(parents=True, exist_ok=True) + npz_path = tiles_dir / f"{request.whole_slide.sample_id}.coordinates.npz" + meta_path = tiles_dir / f"{request.whole_slide.sample_id}.coordinates.meta.json" npz_path.write_bytes(b"npz") meta_path.write_text("{}") return SimpleNamespace( @@ -581,8 +581,8 @@ def _fake_compute_and_save(request): ok=True, artifact=TilingArtifacts( sample_id=request.whole_slide.sample_id, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, num_tiles=1, ), mask_preview_path=None, @@ -653,10 +653,10 @@ def test_tile_slides_assigns_inner_workers_when_batch_is_small( def _fake_compute_and_save(request): seen["inner_workers"].append(request.num_workers) - coordinates_dir = Path(request.output_dir) / "coordinates" - coordinates_dir.mkdir(parents=True, exist_ok=True) - npz_path = coordinates_dir / f"{request.whole_slide.sample_id}.tiles.npz" - meta_path = coordinates_dir / f"{request.whole_slide.sample_id}.tiles.meta.json" + tiles_dir = Path(request.output_dir) / "tiles" + tiles_dir.mkdir(parents=True, exist_ok=True) + npz_path = tiles_dir / f"{request.whole_slide.sample_id}.coordinates.npz" + meta_path = tiles_dir / f"{request.whole_slide.sample_id}.coordinates.meta.json" npz_path.write_bytes(b"npz") meta_path.write_text("{}") return SimpleNamespace( @@ -665,8 +665,8 @@ def _fake_compute_and_save(request): ok=True, artifact=TilingArtifacts( sample_id=request.whole_slide.sample_id, - tiles_npz_path=npz_path, - tiles_meta_path=meta_path, + coordinates_npz_path=npz_path, + coordinates_meta_path=meta_path, num_tiles=1, ), mask_preview_path=None, @@ -767,7 +767,7 @@ def test_save_tiling_result_cleans_up_partial_outputs_when_metadata_write_fails( monkeypatch, tmp_path: Path ): result = _build_result(sample_id="slide-clean", image_path="slide-clean.svs") - coordinates_dir = tmp_path / "coordinates" + tiles_dir = tmp_path / "tiles" def _raise_json(*args, **kwargs): raise RuntimeError("json failure") @@ -777,9 +777,9 @@ def _raise_json(*args, **kwargs): with pytest.raises(RuntimeError, match="json failure"): save_tiling_result(result, output_dir=tmp_path) - assert not (coordinates_dir / "slide-clean.tiles.npz").exists() - assert not (coordinates_dir / "slide-clean.tiles.meta.json").exists() - assert list(coordinates_dir.glob("*")) == [] + assert not (tiles_dir / "slide-clean.coordinates.npz").exists() + assert not (tiles_dir / "slide-clean.coordinates.meta.json").exists() + assert list(tiles_dir.glob("*")) == [] def test_tile_slide_rejects_tissue_fraction_shape_mismatch( @@ -817,8 +817,8 @@ def test_validate_tiling_artifacts_rejects_mismatched_hash(tmp_path: Path): with pytest.raises(ValueError, match="config_hash"): validate_tiling_artifacts( whole_slide=SlideSpec(sample_id="slide-3", image_path=Path("slide-3.svs")), - tiles_npz_path=artifacts.tiles_npz_path, - tiles_meta_path=artifacts.tiles_meta_path, + coordinates_npz_path=artifacts.coordinates_npz_path, + coordinates_meta_path=artifacts.coordinates_meta_path, expected_config_hash="different-hash", ) @@ -832,8 +832,8 @@ def test_validate_tiling_artifacts_rejects_mismatched_image_path(tmp_path: Path) whole_slide=SlideSpec( sample_id="slide-4", image_path=Path("requested-slide.svs") ), - tiles_npz_path=artifacts.tiles_npz_path, - tiles_meta_path=artifacts.tiles_meta_path, + coordinates_npz_path=artifacts.coordinates_npz_path, + coordinates_meta_path=artifacts.coordinates_meta_path, expected_config_hash="actual-hash", ) @@ -853,8 +853,8 @@ def test_validate_tiling_artifacts_rejects_mismatched_mask_path(tmp_path: Path): image_path=Path("slide-5.svs"), mask_path=Path("requested-mask.png"), ), - tiles_npz_path=artifacts.tiles_npz_path, - tiles_meta_path=artifacts.tiles_meta_path, + coordinates_npz_path=artifacts.coordinates_npz_path, + coordinates_meta_path=artifacts.coordinates_meta_path, expected_config_hash="actual-hash", ) @@ -892,15 +892,15 @@ def _unexpected_extract(**kwargs): segmentation=segmentation_config, filtering=filter_config, output_dir=tmp_path / "run", - read_tiles_from=precomputed_artifacts.tiles_npz_path.parent, + read_coordinates_from=precomputed_artifacts.coordinates_npz_path.parent, resume=False, ) assert len(artifacts) == 1 artifact = artifacts[0] assert isinstance(artifact, TilingArtifacts) - assert artifact.tiles_npz_path == precomputed_artifacts.tiles_npz_path - assert artifact.tiles_meta_path == precomputed_artifacts.tiles_meta_path + assert artifact.coordinates_npz_path == precomputed_artifacts.coordinates_npz_path + assert artifact.coordinates_meta_path == precomputed_artifacts.coordinates_meta_path assert artifact.num_tiles == 2 process_df = pd.read_csv(tmp_path / "run" / "process_list.csv") @@ -910,8 +910,9 @@ def _unexpected_extract(**kwargs): "mask_path", "tiling_status", "num_tiles", - "tiles_npz_path", - "tiles_meta_path", + "coordinates_npz_path", + "coordinates_meta_path", + "tiles_tar_path", "error", "traceback", ] @@ -921,8 +922,8 @@ def _unexpected_extract(**kwargs): assert pd.isna(row["mask_path"]) assert row["tiling_status"] == "success" assert row["num_tiles"] == 2 - assert row["tiles_npz_path"] == str(precomputed_artifacts.tiles_npz_path) - assert row["tiles_meta_path"] == str(precomputed_artifacts.tiles_meta_path) + assert row["coordinates_npz_path"] == str(precomputed_artifacts.coordinates_npz_path) + assert row["coordinates_meta_path"] == str(precomputed_artifacts.coordinates_meta_path) assert pd.isna(row["error"]) assert pd.isna(row["traceback"]) @@ -1042,8 +1043,8 @@ def test_tile_slides_resume_marks_stale_artifact_as_failed( "mask_path": np.nan, "tiling_status": "success", "num_tiles": 1, - "tiles_npz_path": str(artifacts.tiles_npz_path), - "tiles_meta_path": str(artifacts.tiles_meta_path), + "coordinates_npz_path": str(artifacts.coordinates_npz_path), + "coordinates_meta_path": str(artifacts.coordinates_meta_path), "error": np.nan, "traceback": np.nan, } @@ -1072,8 +1073,8 @@ def test_tile_slides_resume_marks_stale_artifact_as_failed( assert row["sample_id"] == "slide-6" assert row["tiling_status"] == "failed" assert row["num_tiles"] == 0 - assert pd.isna(row["tiles_npz_path"]) - assert pd.isna(row["tiles_meta_path"]) + assert pd.isna(row["coordinates_npz_path"]) + assert pd.isna(row["coordinates_meta_path"]) assert "image_path mismatch" in row["error"] @@ -1120,8 +1121,8 @@ def test_tile_slides_computes_resume_hash_once_per_batch( "mask_path": np.nan, "tiling_status": "success", "num_tiles": 1, - "tiles_npz_path": "slide-1.tiles.npz", - "tiles_meta_path": "slide-1.tiles.meta.json", + "coordinates_npz_path": "slide-1.coordinates.npz", + "coordinates_meta_path": "slide-1.coordinates.meta.json", "error": np.nan, "traceback": np.nan, }, @@ -1131,8 +1132,8 @@ def test_tile_slides_computes_resume_hash_once_per_batch( "mask_path": np.nan, "tiling_status": "success", "num_tiles": 1, - "tiles_npz_path": "slide-2.tiles.npz", - "tiles_meta_path": "slide-2.tiles.meta.json", + "coordinates_npz_path": "slide-2.coordinates.npz", + "coordinates_meta_path": "slide-2.coordinates.meta.json", "error": np.nan, "traceback": np.nan, }, @@ -1149,8 +1150,8 @@ def _fake_validate_tiling_artifacts(**kwargs): sample_id = kwargs["whole_slide"].sample_id return TilingArtifacts( sample_id=sample_id, - tiles_npz_path=Path(f"{sample_id}.tiles.npz"), - tiles_meta_path=Path(f"{sample_id}.tiles.meta.json"), + coordinates_npz_path=Path(f"{sample_id}.coordinates.npz"), + coordinates_meta_path=Path(f"{sample_id}.coordinates.meta.json"), num_tiles=1, ) @@ -1256,8 +1257,8 @@ def test_load_csv_rejects_duplicate_sample_id(tmp_path: Path): def test_load_tiling_result_rejects_missing_npz_keys(tmp_path: Path): - npz_path = tmp_path / "broken.tiles.npz" - meta_path = tmp_path / "broken.tiles.meta.json" + npz_path = tmp_path / "broken.coordinates.npz" + meta_path = tmp_path / "broken.coordinates.meta.json" np.savez( npz_path, tile_index=np.array([0], dtype=np.int32), @@ -1289,8 +1290,8 @@ def test_load_tiling_result_rejects_missing_npz_keys(tmp_path: Path): def test_load_tiling_result_wraps_corrupt_npz_errors_with_path(tmp_path: Path): - npz_path = tmp_path / "corrupt.tiles.npz" - meta_path = tmp_path / "corrupt.tiles.meta.json" + npz_path = tmp_path / "corrupt.coordinates.npz" + meta_path = tmp_path / "corrupt.coordinates.meta.json" npz_path.write_bytes(b"not a valid npz") meta_path.write_text( json.dumps( @@ -1314,14 +1315,14 @@ def test_load_tiling_result_wraps_corrupt_npz_errors_with_path(tmp_path: Path): ) with pytest.raises( - ValueError, match=r"Unable to load tiling npz artifact .*corrupt\.tiles\.npz" + ValueError, match=r"Unable to load tiling npz artifact .*corrupt\.coordinates\.npz" ): load_tiling_result(npz_path, meta_path) def test_load_tiling_result_rejects_missing_meta_keys(tmp_path: Path): - npz_path = tmp_path / "broken-meta.tiles.npz" - meta_path = tmp_path / "broken-meta.tiles.meta.json" + npz_path = tmp_path / "broken-meta.coordinates.npz" + meta_path = tmp_path / "broken-meta.coordinates.meta.json" np.savez( npz_path, tile_index=np.array([0], dtype=np.int32),