Merged
Commits
31 commits
- `5844aa1` various-fixes (clemsgrs, Mar 18, 2026)
- `3db9312` drop benchmark bookkeeping from pr (clemsgrs, Mar 18, 2026)
- `4c5dff4` [codex] Optimize embedding loader pipeline (clemsgrs, Mar 19, 2026)
- `f559bf4` improve ux (clemsgrs, Mar 19, 2026)
- `f80c55e` separate preprocessing from embedding backend (clemsgrs, Mar 19, 2026)
- `01e8bdf` Read embedding tiles from tar archives (clemsgrs, Mar 20, 2026)
- `3480913` make slide2vec default aligned with new hs2p tile store format (clemsgrs, Mar 20, 2026)
- `0aec7b3` Merge branch 'codex/various-fixes' of https://github.com/clemsgrs/sli… (clemsgrs, Mar 20, 2026)
- `a3c1e67` add on-the-fly CuCIM tile reading with super tile support (clemsgrs, Mar 21, 2026)
- `84d16f5` fix review issues in on-the-fly tile reading (clemsgrs, Mar 21, 2026)
- `c2278fc` remove unused tile_indices and crop_offsets from _SuperTile (clemsgrs, Mar 21, 2026)
- `899ed9a` add turbojpeg + nvimgcode register to dockerfiles (clemsgrs, Mar 21, 2026)
- `85b9622` install libjpeg-turbo 3.x in all Dockerfiles (clemsgrs, Mar 21, 2026)
- `31fc1a6` add slide2vec[cucim] extra for on-the-fly tile reading deps (clemsgrs, Mar 21, 2026)
- `2154426` fix failing tests (clemsgrs, Mar 21, 2026)
- `a30bc8a` adapt to hs2p 2.4.1: use plan.tile_indices in supertile index build (clemsgrs, Mar 22, 2026)
- `e465997` expose jpeg_backend option and use pil in consistency test (clemsgrs, Mar 22, 2026)
- `b7dbcd4` decouple cucim and DataLoader worker counts in on-the-fly path (clemsgrs, Mar 22, 2026)
- `619bde8` separate preprocessing and DataLoader worker counts (clemsgrs, Mar 22, 2026)
- `e2ffbd3` various changes (clemsgrs, Mar 23, 2026)
- `e015a71` Default backend to auto (clemsgrs, Mar 23, 2026)
- `a72304d` Untrack local documentation log (clemsgrs, Mar 23, 2026)
- `e79622d` make cucim workers a speed config arg (clemsgrs, Mar 23, 2026)
- `bd26f64` add missing data file (clemsgrs, Mar 23, 2026)
- `91b62a6` fix output consistency test: align resize interpolation across tile r… (clemsgrs, Mar 23, 2026)
- `7317b17` add leftover test (clemsgrs, Mar 23, 2026)
- `9b3d44e` Enforce recommended model input settings (clemsgrs, Mar 23, 2026)
- `d62ceaa` trim model configs (clemsgrs, Mar 23, 2026)
- `6991a07` strip comments (clemsgrs, Mar 23, 2026)
- `0fa61b7` Align model defaults and precision handling (clemsgrs, Mar 23, 2026)
- `f228768` Allow CPU runs to bypass precision checks (clemsgrs, Mar 23, 2026)
2 changes: 1 addition & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data/
output/
docker/
outputs/
1 change: 1 addition & 0 deletions .gitignore
@@ -168,3 +168,4 @@ archive/
tasks/
docs/documentation.md
docs/20*-*.md
data/
27 changes: 23 additions & 4 deletions Dockerfile
@@ -1,11 +1,10 @@
ARG UBUNTU_VERSION=22.04
ARG CUDA_MAJOR_VERSION=11.8.0
ARG CUDNN_MAJOR_VERSION=8
ARG CUDA_MAJOR_VERSION=12.8.1

########################
# Stage 1: build stage #
########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn${CUDNN_MAJOR_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build

ARG USER_UID=1001
ARG USER_GID=1001
@@ -29,6 +28,7 @@ ENV PATH="/home/user/.local/bin:${PATH}"

RUN apt-get update && apt-get install -y --no-install-recommends \
libtiff-dev \
cmake \
zlib1g-dev \
curl \
vim screen \
@@ -40,6 +40,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# libjpeg-turbo 3.x (required by PyTurboJPEG>=2)
ARG LIBJPEG_TURBO_VERSION=3.1.0
RUN curl -fsSL https://github.com/libjpeg-turbo/libjpeg-turbo/releases/download/${LIBJPEG_TURBO_VERSION}/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}.tar.gz \
| tar xz -C /tmp \
&& cd /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION} \
&& cmake -G"Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr/local . \
&& make -j"$(nproc)" && make install \
&& ldconfig \
&& rm -rf /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}

WORKDIR /opt/app/

# core deps live in requirements.in; model runtime extras live in requirements-models.in
@@ -70,7 +80,7 @@ RUN python -m pip install 'flash-attn>=2.7.1,<=2.8.0' --no-build-isolation
##########################
# Stage 2: runtime stage #
##########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn${CUDNN_MAJOR_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION}

ARG USER_UID=1001
ARG USER_GID=1001
@@ -104,6 +114,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# libjpeg-turbo 3.x (copied from build stage)
COPY --from=build /usr/local/lib/libjpeg* /usr/local/lib/
COPY --from=build /usr/local/lib/libturbojpeg* /usr/local/lib/
RUN ldconfig

# install ASAP
ARG ASAP_URL=https://github.com/computationalpathologygroup/ASAP/releases/download/ASAP-2.2-(Nightly)/ASAP-2.2-Ubuntu2204.deb
RUN apt-get update && curl -L ${ASAP_URL} -o /tmp/ASAP.deb && apt-get install --assume-yes /tmp/ASAP.deb && \
@@ -116,6 +131,10 @@ RUN apt-get update && curl -L ${ASAP_URL} -o /tmp/ASAP.deb && apt-get install --
COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=build /usr/local/bin /usr/local/bin

# register libnvimgcodec so cucim can use GPU-accelerated JPEG decoding
RUN echo "/usr/local/lib/python3.10/dist-packages/nvidia/nvimgcodec" > /etc/ld.so.conf.d/nvimgcodec.conf && \
ldconfig

# copy app code
COPY --from=build /opt/app /opt/app

15 changes: 15 additions & 0 deletions Dockerfile.ci
@@ -21,6 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libtiff-dev \
zlib1g-dev \
curl \
cmake \
vim screen \
zip unzip \
git \
@@ -31,6 +32,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# libjpeg-turbo 3.x (required by PyTurboJPEG>=2)
ARG LIBJPEG_TURBO_VERSION=3.1.0
RUN curl -fsSL https://github.com/libjpeg-turbo/libjpeg-turbo/releases/download/${LIBJPEG_TURBO_VERSION}/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}.tar.gz \
| tar xz -C /tmp \
&& cd /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION} \
&& cmake -G"Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr/local . \
&& make -j"$(nproc)" && make install \
&& ldconfig \
&& rm -rf /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}

# ASAP
ARG ASAP_URL=https://github.com/computationalpathologygroup/ASAP/releases/download/ASAP-2.2-(Nightly)/ASAP-2.2-Ubuntu2204.deb
RUN set -eux; \
@@ -65,5 +76,9 @@ COPY --chown=user:user LICENSE /opt/app/LICENSE

RUN python -m pip install /opt/app

# register libnvimgcodec so cucim can use GPU-accelerated JPEG decoding
RUN echo "/usr/local/lib/python3.10/dist-packages/nvidia/nvimgcodec" > /etc/ld.so.conf.d/nvimgcodec.conf && \
ldconfig

USER user
WORKDIR /opt/app
156 changes: 156 additions & 0 deletions Dockerfile.coding-agents
@@ -0,0 +1,156 @@
ARG UBUNTU_VERSION=22.04
ARG CUDA_MAJOR_VERSION=12.8.1

########################
# Stage 1: build stage #
########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build

ARG USER_UID=1001
ARG USER_GID=1001

# ensures that Python output to stdout/stderr is not buffered: prevents missing information when terminating
ENV PYTHONUNBUFFERED=1
ENV DEBIAN_FRONTEND=noninteractive TZ=Europe/Amsterdam

USER root

RUN groupadd --gid ${USER_GID} user \
&& useradd -m --no-log-init --uid ${USER_UID} --gid ${USER_GID} user

# create input/output directory
RUN mkdir /input /output && \
chown user:user /input /output

# set /home/user as working directory
WORKDIR /home/user
ENV PATH="/home/user/.local/bin:${PATH}"

RUN apt-get update && apt-get install -y --no-install-recommends \
libtiff-dev \
cmake \
zlib1g-dev \
curl \
vim screen \
zip unzip \
git \
openssh-server \
python3-pip python3-dev python-is-python3 \
&& mkdir /var/run/sshd \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# libjpeg-turbo 3.x (required by PyTurboJPEG>=2)
ARG LIBJPEG_TURBO_VERSION=3.1.0
RUN curl -fsSL https://github.com/libjpeg-turbo/libjpeg-turbo/releases/download/${LIBJPEG_TURBO_VERSION}/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}.tar.gz \
| tar xz -C /tmp \
&& cd /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION} \
&& cmake -G"Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr/local . \
&& make -j"$(nproc)" && make install \
&& ldconfig \
&& rm -rf /tmp/libjpeg-turbo-${LIBJPEG_TURBO_VERSION}

WORKDIR /opt/app/

# core deps live in requirements.in; model runtime extras live in requirements-models.in
RUN python -m pip install --upgrade pip setuptools pip-tools \
&& rm -rf /home/user/.cache/pip

# install slide2vec
COPY --chown=user:user requirements.in /opt/app/requirements.in
COPY --chown=user:user requirements-models.in /opt/app/requirements-models.in
RUN python -m pip install \
--no-cache-dir \
--no-color \
--requirement /opt/app/requirements-models.in \
&& rm -rf /home/user/.cache/pip

COPY --chown=user:user slide2vec /opt/app/slide2vec
COPY --chown=user:user setup.py /opt/app/setup.py
COPY --chown=user:user setup.cfg /opt/app/setup.cfg
COPY --chown=user:user pyproject.toml /opt/app/pyproject.toml
COPY --chown=user:user MANIFEST.in /opt/app/MANIFEST.in
COPY --chown=user:user README.md /opt/app/README.md
COPY --chown=user:user LICENSE /opt/app/LICENSE

RUN python -m pip install /opt/app
RUN python -m pip install 'flash-attn>=2.7.1,<=2.8.0' --no-build-isolation


##########################
# Stage 2: runtime stage #
##########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION}

ARG USER_UID=1001
ARG USER_GID=1001

ENV PYTHONUNBUFFERED=1
ENV DEBIAN_FRONTEND=noninteractive TZ=Europe/Amsterdam

USER root

RUN groupadd --gid ${USER_GID} user \
&& useradd -m --no-log-init --uid ${USER_UID} --gid ${USER_GID} user

# create input/output directory
RUN mkdir /input /output && \
chown user:user /input /output

# set /home/user as working directory
WORKDIR /home/user
ENV PATH="/home/user/.local/bin:${PATH}"

RUN apt-get update && apt-get install -y --no-install-recommends \
libtiff-dev \
zlib1g-dev \
curl \
vim screen \
zip unzip \
git \
openssh-server \
python3-pip python3-dev python-is-python3 \
&& mkdir /var/run/sshd \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# libjpeg-turbo 3.x (copied from build stage)
COPY --from=build /usr/local/lib/libjpeg* /usr/local/lib/
COPY --from=build /usr/local/lib/libturbojpeg* /usr/local/lib/
RUN ldconfig

RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
&& apt-get install -y --no-install-recommends nodejs

# install ASAP
ARG ASAP_URL=https://github.com/computationalpathologygroup/ASAP/releases/download/ASAP-2.2-(Nightly)/ASAP-2.2-Ubuntu2204.deb
RUN apt-get update && curl -L ${ASAP_URL} -o /tmp/ASAP.deb && apt-get install --assume-yes /tmp/ASAP.deb && \
SITE_PACKAGES=`python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"` && \
printf "/opt/ASAP/bin/\n" > "${SITE_PACKAGES}/asap.pth" && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# install codex
RUN npm i -g @openai/codex

# install claude
RUN curl -fsSL https://claude.ai/install.sh | bash

# copy Python libs & entrypoints from build stage (includes flash-attn, your deps, ASAP .pth)
COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=build /usr/local/bin /usr/local/bin

# register libnvimgcodec so cucim can use GPU-accelerated JPEG decoding
RUN echo "/usr/local/lib/python3.10/dist-packages/nvidia/nvimgcodec" > /etc/ld.so.conf.d/nvimgcodec.conf && \
ldconfig

# copy app code
COPY --from=build /opt/app /opt/app

# expose port for ssh and jupyter
EXPOSE 22 8888

WORKDIR /opt/app/

# switch to user
USER user
120 changes: 120 additions & 0 deletions docs/benchmarking.md
@@ -0,0 +1,120 @@
# Benchmarking

`slide2vec` includes a benchmark runner for end-to-end embedding throughput sweeps across different GPU environments and multiple model configs.

The script samples a balanced subset of your manifest, runs untimed warmups plus repeated measured trials, and tunes only two parameters:

- `model.batch_size`
- `speed.num_workers_embedding`

It keeps the rest of each model config fixed, disables previews, resume, and Weights & Biases logging, and writes:

- `trial_results.csv`
- `best_results.csv`
- `throughput_by_gpu.png`
- `throughput_by_gpu_and_size.png`
- `tuning_<gpu>_<model>.png`

Default sweep values:

- `--n-slides 0` to benchmark the full manifest by default
- `--batch-sizes 1 32 64 128 256`
- `--embedding-workers 4 8 16 32 64 128`
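Taken together, the defaults sweep the cross-product of the two tuned lists. A minimal sketch of that grid (the full cross-product pairing is an assumption about the runner, not documented behavior):

```python
# Enumerate the assumed tuning grid: every (batch_size, embedding_workers)
# pair from the default sweep lists above.
from itertools import product

batch_sizes = [1, 32, 64, 128, 256]
embedding_workers = [4, 8, 16, 32, 64, 128]

grid = list(product(batch_sizes, embedding_workers))
print(len(grid))  # 30 candidate configurations per model per GPU
```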

## Basic Usage

```shell
python scripts/benchmark_embedding_throughput.py \
--config-files /path/to/pathojepa-small.yaml /path/to/pathojepa-base.yaml /path/to/pathojepa-large.yaml \
--model-labels PathoJEPA-S PathoJEPA-B PathoJEPA-L \
--size-labels S B L \
--csv /path/to/slides.csv \
--gpu-label "A100-80GB" \
--batch-sizes 1 32 64 128 256 \
--embedding-workers 4 8 16 32 64 128 \
--repeat 3 \
--n-slides 0 \
--output-dir /tmp/slide2vec-benchmark
```

Notes:

- the benchmark measures the full `Pipeline.run(...)` path, including tiling
- stage timings for tiling, embedding, and aggregation are also recorded when progress events are available
- embedding trials also record per-batch timing summaries from `embedding.batch.timing` events, including mean loader wait, mean ready-wait after async copy/preprocess, mean preprocess time, mean forward time, and a loader-wait fraction
- every compared model reuses the same sampled manifest within a run
- each config gets an untimed warmup before measured repeats
- benchmark config files are loaded through the same default-merge and validation path as the regular CLI, so omitted standard keys inherit the usual defaults
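If you want to post-process `trial_results.csv` yourself, a small helper like the one below works; the column names used here (`gpu`, `model`, `tiles_per_second`) are illustrative guesses, not the script's documented schema:

```python
import csv
from typing import Dict, Iterable, Tuple

def best_trials(rows: Iterable[dict]) -> Dict[Tuple[str, str], dict]:
    """Keep the highest-throughput row per (gpu, model) pair."""
    best: Dict[Tuple[str, str], dict] = {}
    for row in rows:
        key = (row["gpu"], row["model"])
        if key not in best or float(row["tiles_per_second"]) > float(best[key]["tiles_per_second"]):
            best[key] = row
    return best

# With the real file you would pass csv.DictReader(open("trial_results.csv"));
# here a tiny inline sample stands in for it.
rows = [
    {"gpu": "A100", "model": "S", "batch_size": "64", "tiles_per_second": "900"},
    {"gpu": "A100", "model": "S", "batch_size": "128", "tiles_per_second": "1100"},
]
print(best_trials(rows)[("A100", "S")]["batch_size"])  # 128
```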

Single-model usage is still supported:

```shell
python scripts/benchmark_embedding_throughput.py \
--config-file /path/to/model-config.yaml \
--csv /path/to/slides.csv \
--gpu-label "A100-80GB"
```

In multi-model mode:

- `--config-files` is the primary interface
- `--model-labels` must match the config count
- `--size-labels` must match the config count
- size labels are explicit metadata like `S`, `B`, `L`, `XL`; the script does not infer them

## Merging GPU Runs

Run the benchmark once per GPU environment, then regenerate the cross-GPU comparison chart from multiple `trial_results.csv` files:

```shell
python scripts/benchmark_embedding_throughput.py \
--chart-only \
/tmp/a100-benchmark/trial_results.csv \
/tmp/h100-benchmark/trial_results.csv \
--output-dir /tmp/slide2vec-benchmark-merged
```

The merged outputs include:

- `throughput_by_gpu.png` for best tuned model entries per GPU
- `throughput_by_gpu_and_size.png` for grouped GPU-vs-size bars, choosing the winning model for each `(gpu, size)` bucket

Use `--copy-locally` when your slide source lives on network storage and you want to reduce I/O variance during the sweep.

## End-to-End Path Comparison

For a direct full-pipeline comparison between:

- tar-based embedding (`on_the_fly=false`)
- on-the-fly `wsd_single` embedding (`backend=asap`, `use_supertiles=false`)
- on-the-fly `cucim_supertiles` embedding

use:

```shell
python scripts/benchmark_end_to_end_paths.py \
--csv /path/to/slides.csv \
--config-file /path/to/model-config.yaml \
--batch-size 256 \
--repeat 1 \
--output-dir /tmp/slide2vec-end-to-end
```

The model is taken from `--config-file`; the script does not accept a separate `--model` override.

This benchmark runs the three paths independently from raw slide input to final embedding artifact and writes:

- `trial_results.csv`
- `summary.csv`
- `end_to_end_by_path.png`
- `stage_breakdown.png`
- `embedding_subpath_breakdown.png`

The summary also includes an embedding subpath split derived from per-batch timing events:

- `mean_data_pipeline_seconds`: timed embedding seconds spent in loader wait, ready
wait, and preprocessing
- `mean_forward_seconds`: timed embedding seconds spent in model forward
- `mean_data_pipeline_fraction` / `mean_forward_fraction`: shares of the timed
embedding batches accounted for by those two buckets
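
The fraction columns follow directly from the two bucket means; a sketch of the assumed arithmetic (function and column names are hypothetical):

```python
def subpath_fractions(mean_data_pipeline_seconds: float,
                      mean_forward_seconds: float) -> tuple:
    """Split timed embedding batch time into data-pipeline vs forward shares,
    assuming the two buckets together account for the timed total."""
    total = mean_data_pipeline_seconds + mean_forward_seconds
    return (mean_data_pipeline_seconds / total, mean_forward_seconds / total)

print(tuple(round(x, 2) for x in subpath_fractions(0.03, 0.07)))  # (0.3, 0.7)
```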