Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ You can also upload your own pyramidal WSI (up to 1 GB).
pip install hs2p
```

Optional CuCIM install for faster tile tar export when using `tiling.backend="cucim"`:

```bash
pip install cucim-cu12
```

Use the CuCIM wheel that matches your CUDA runtime. The base `hs2p` install does not
require CuCIM.

## Workflows

### Tiling
Expand Down Expand Up @@ -122,6 +131,11 @@ For a first run, start from [hs2p/configs/default.yaml](hs2p/configs/default.yam
- `tiling.params.target_spacing_um`
- `tiling.params.target_tile_size_px`

Optional:

- `save_tiles`
- also write `tiles/{sample_id}.tiles.tar` archives; with `tiling.backend="cucim"` this uses batched CuCIM reads during tar extraction

Run tiling:

```bash
Expand Down
22 changes: 21 additions & 1 deletion docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@ Run sampling:
python -m hs2p.sampling --config-file /path/to/config.yaml
```

Optional CuCIM install for faster tar export with `save_tiles: true` and
`tiling.backend: cucim`:

```bash
pip install cucim-cu12
```

Use the CuCIM wheel that matches your CUDA runtime. Non-CuCIM backends continue to
use the default sequential tile export path.

## Progress UX

When stdout is an interactive terminal, `hs2p` uses `rich` to show live progress for both CLI entrypoints.
Expand Down Expand Up @@ -85,8 +95,10 @@ Detailed logs still go to `output_dir/logs/log.txt`, which is the best place to
- Annotation-specific sampling rules for `hs2p.sampling`
- `save_previews`
- Global switch for writing mask and tiling previews to disk
- `save_tiles`
- Global switch for writing `tiles/{sample_id}.tiles.tar` alongside coordinate artifacts
- `speed.num_workers`
- Parallelism for slide processing
- Parallelism for slide processing, and the per-slide worker budget reused by CuCIM batched tile extraction when `tiling.backend: cucim`

## Sampling-specific settings

Expand Down Expand Up @@ -120,6 +132,14 @@ These filters are **disabled by default** and should stay off unless your datase

When enabled, every candidate tile that passes the tissue mask check is read from the slide at full resolution and its pixel values inspected. This is the **only step in the tiling pipeline that reads actual tile pixel data**. For slides with large internal JPEG tiles (common in some scanner formats), each read triggers a full JPEG decode of the underlying tile block — which can be an order of magnitude slower than the rest of the pipeline per slide.

### Saved tile export (`save_tiles`)

When `save_tiles: true`, HS2P also writes a `tiles/{sample_id}.tiles.tar` archive with JPEG-encoded tile images.

- For non-CuCIM backends, tar extraction uses the existing sequential reader.
- For `tiling.backend: cucim`, tar extraction uses a CuCIM batch-read fast path and reuses the per-slide worker count from `speed.num_workers`.
- Installing CuCIM is optional. If `backend: cucim` is selected but CuCIM is not installed, HS2P falls back to the sequential export path and emits a warning.

## Resume and precomputed artifacts

- `resume: true` expects the current `process_list.csv` schema and current-format artifacts
Expand Down
89 changes: 78 additions & 11 deletions hs2p/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import importlib
import io
import json
import multiprocessing as mp
Expand Down Expand Up @@ -487,22 +488,20 @@ def extract_tiles_to_tar(
jpeg_quality: int = 90,
tiles_dir: Path | None = None,
filter_params: FilterConfig | None = None,
num_workers: int = 4,
) -> tuple[Path, TilingResult]:
"""Extract tile images from a WSI and save them as a JPEG tar archive.

When *filter_params* requests white/black filtering the tiles are checked
during extraction so that pixel data is read only once. The returned
``TilingResult`` has its coordinate arrays trimmed to the surviving tiles.
"""
import wholeslidedata as wsd
from PIL import Image

tiles_dir = Path(tiles_dir) if tiles_dir is not None else Path(output_dir) / "tiles"
tiles_dir.mkdir(parents=True, exist_ok=True)
tar_path = tiles_dir / f"{result.sample_id}.tiles.tar"

wsi = wsd.WholeSlideImage(result.image_path, backend=result.backend)

do_filter_white = filter_params is not None and filter_params.filter_white
do_filter_black = filter_params is not None and filter_params.filter_black
white_thresh = getattr(filter_params, "white_threshold", 220) if filter_params else 220
Expand All @@ -520,15 +519,12 @@ def extract_tiles_to_tar(
temp_tar_path = Path(tmp.name)

with tarfile.open(temp_tar_path, "w") as tf:
for i in range(result.num_tiles):
tile_arr = wsi.get_patch(
int(result.x[i]),
int(result.y[i]),
int(result.read_tile_size_px),
int(result.read_tile_size_px),
spacing=float(result.read_spacing_um),
center=False,
for i, tile_arr in enumerate(
_iter_tile_arrays_for_tar_extraction(
result=result,
num_workers=num_workers,
)
):
if tile_arr.shape[2] > 3:
tile_arr = tile_arr[:, :, :3]

Expand Down Expand Up @@ -601,6 +597,76 @@ def extract_tiles_to_tar(
return tar_path, filtered_result


def _iter_tile_arrays_for_tar_extraction(
    *,
    result: TilingResult,
    num_workers: int,
):
    """Yield one pixel array per tile for tar export.

    Prefers the CuCIM batched reader when the result's backend selects it
    (and CuCIM can be imported); otherwise streams tiles sequentially
    through the wholeslidedata reader.
    """
    cucim_arrays = _iter_cucim_tile_arrays_for_tar_extraction(
        result=result,
        num_workers=num_workers,
    )
    if cucim_arrays is None:
        # Non-CuCIM backend or CuCIM unavailable: sequential fallback.
        yield from _iter_wsd_tile_arrays_for_tar_extraction(result=result)
    else:
        yield from cucim_arrays


def _iter_cucim_tile_arrays_for_tar_extraction(
*,
result: TilingResult,
num_workers: int,
):
if result.backend != "cucim":
return None
try:
cucim = importlib.import_module("cucim")
except ModuleNotFoundError:
warnings.warn(
"CuCIM is unavailable for backend='cucim'; falling back to sequential wholeslidedata tile extraction.",
UserWarning,
stacklevel=2,
)
return None

cu_image = cucim.CuImage(str(result.image_path))
locations = [
(int(x), int(y))
for x, y in zip(
result.x.astype(np.int64, copy=False).tolist(),
result.y.astype(np.int64, copy=False).tolist(),
)
]
read_size = (int(result.read_tile_size_px), int(result.read_tile_size_px))
return (
np.asarray(region)
for region in cu_image.read_region(
locations,
read_size,
level=int(result.read_level),
num_workers=max(1, int(num_workers)),
)
)


def _iter_wsd_tile_arrays_for_tar_extraction(
    *,
    result: TilingResult,
):
    """Sequentially yield one pixel array per tile via wholeslidedata."""
    import wholeslidedata as wsd

    slide = wsd.WholeSlideImage(result.image_path, backend=result.backend)
    for idx in range(result.num_tiles):
        # One full-resolution read per tile coordinate pair.
        yield slide.get_patch(
            int(result.x[idx]),
            int(result.y[idx]),
            int(result.read_tile_size_px),
            int(result.read_tile_size_px),
            spacing=float(result.read_spacing_um),
            center=False,
        )


def _needs_pixel_filtering(filtering: FilterConfig) -> bool:
return bool(filtering.filter_white or filtering.filter_black)

Expand Down Expand Up @@ -1000,6 +1066,7 @@ def _compute_tiling_result_from_request(
result,
output_dir=request.output_dir,
filter_params=request.filtering if _needs_pixel_filtering(request.filtering) else None,
num_workers=request.num_workers,
)
artifact = save_tiling_result(result, output_dir=request.output_dir)
artifact = TilingArtifacts(
Expand Down
1 change: 1 addition & 0 deletions hs2p/configs/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ resume: false # resume from a previous run
resume_dirname: # directory name to resume from

save_previews: true # save preview images of slide tiling and mask overlays
save_tiles: false # save extracted tiles as {sample_id}.tiles.tar in addition to coordinate artifacts

seed: 0 # seed for reproducibility

Expand Down
1 change: 1 addition & 0 deletions hs2p/tiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def main(args):
num_workers=cfg.speed.num_workers,
resume=cfg.resume,
read_coordinates_from=read_coordinates_from,
save_tiles=bool(getattr(cfg, "save_tiles", False)),
)
pd.read_csv(output_dir / "process_list.csv")
progress.emit_progress(
Expand Down
4 changes: 4 additions & 0 deletions tests/test_cli_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def _base_cfg(tmp_path: Path, csv_path: Path) -> SimpleNamespace:
output_dir=str(tmp_path / "output"),
resume=False,
save_previews=False,
save_tiles=False,
speed=SimpleNamespace(num_workers=1),
tiling=SimpleNamespace(
read_coordinates_from=None,
Expand Down Expand Up @@ -102,9 +103,11 @@ def _fake_tile_slides(
num_workers,
resume,
read_coordinates_from,
save_tiles,
):
del tiling, segmentation, filtering, preview, num_workers, resume, read_coordinates_from
captured["whole_slides"] = whole_slides
captured["save_tiles"] = save_tiles
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
process_df = pd.DataFrame(
Expand Down Expand Up @@ -148,6 +151,7 @@ def _fake_tile_slides(
mask_path=Path("slide-1-mask.png"),
)
]
assert captured["save_tiles"] is False
process_df = pd.read_csv(Path(cfg.output_dir) / "process_list.csv")
assert list(process_df.columns) == [
"sample_id",
Expand Down
71 changes: 71 additions & 0 deletions tests/test_tile_extraction.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Tests for extract_tiles_to_tar() and the save_tiles pipeline option."""

import io
import types
import tarfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import numpy as np
import pytest
from PIL import Image

from hs2p.api import TilingResult, extract_tiles_to_tar
Expand Down Expand Up @@ -228,6 +230,75 @@ def test_no_filter_params_keeps_all_tiles(self, tmp_path: Path):

assert out_result is result # unchanged

def test_cucim_backend_uses_batched_read_region(self, monkeypatch, tmp_path: Path):
    """Export with backend='cucim' should make one batched CuCIM read_region call."""
    result = _make_tiling_result(num_tiles=2)
    result.backend = "cucim"
    result.read_level = 3
    result.read_tile_size_px = 128

    # Two solid-color tiles stand in for the regions CuCIM would return.
    regions = [_solid_patch((10, 20, 30), size=128), _solid_patch((40, 50, 60), size=128)]
    mock_cu_image = MagicMock()
    mock_cu_image.read_region.return_value = iter(regions)
    fake_cucim = types.SimpleNamespace(CuImage=MagicMock(return_value=mock_cu_image))

    import hs2p.api as api_mod

    # Make importlib.import_module("cucim") resolve to the fake module object.
    monkeypatch.setattr(
        api_mod.importlib,
        "import_module",
        lambda name: fake_cucim if name == "cucim" else None,
    )

    with patch("wholeslidedata.WholeSlideImage") as mock_wsd:
        tar_path, out_result = extract_tiles_to_tar(
            result,
            output_dir=tmp_path,
            num_workers=5,
        )

    assert tar_path.is_file()
    assert out_result is result  # no filtering requested, so the result passes through unchanged
    fake_cucim.CuImage.assert_called_once_with(str(result.image_path))
    # Exactly one batched call carrying every tile location, the read size,
    # the read level, and the per-slide worker budget.
    mock_cu_image.read_region.assert_called_once_with(
        [(0, 0), (256, 0)],
        (128, 128),
        level=3,
        num_workers=5,
    )
    mock_wsd.assert_not_called()  # the sequential wholeslidedata path must be skipped

def test_cucim_backend_falls_back_to_wsd_when_cucim_is_unavailable(
    self, monkeypatch, tmp_path: Path
):
    """If CuCIM cannot be imported, export should warn and use wholeslidedata."""
    result = _make_tiling_result(num_tiles=1)
    result.backend = "cucim"

    mock_wsi = MagicMock()
    mock_wsi.get_patch.return_value = _solid_patch((70, 80, 90))

    import hs2p.api as api_mod

    def _import_module(name):
        # Simulate a missing CuCIM install; any other module import is unexpected.
        if name == "cucim":
            raise ModuleNotFoundError("No module named 'cucim'")
        raise AssertionError(f"unexpected module import: {name}")

    monkeypatch.setattr(api_mod.importlib, "import_module", _import_module)

    # The fallback must both warn about CuCIM and route reads through
    # the (mocked) wholeslidedata reader.
    with pytest.warns(UserWarning, match="CuCIM is unavailable"), patch(
        "wholeslidedata.WholeSlideImage",
        return_value=mock_wsi,
    ) as mock_wsd:
        tar_path, out_result = extract_tiles_to_tar(
            result,
            output_dir=tmp_path,
            num_workers=4,
        )

    assert tar_path.is_file()
    assert out_result is result  # no filtering requested, so the result passes through unchanged
    # The fallback path must open the slide through wholeslidedata exactly once.
    mock_wsd.assert_called_once_with(result.image_path, backend="cucim")


class TestNeedsPixelFiltering:
def test_no_filtering(self):
Expand Down
Loading
Loading