diff --git a/slide2vec/configs/default.yaml b/slide2vec/configs/default.yaml index 008fcdd..02d3241 100644 --- a/slide2vec/configs/default.yaml +++ b/slide2vec/configs/default.yaml @@ -46,6 +46,7 @@ tiling: sam2_checkpoint_path: # optional when method="sam2"; if empty, hs2p downloads the default AtlasPatch checkpoint from Hugging Face sam2_config_path: # optional local override for the SAM2 model config; if empty, hs2p downloads the default AtlasPatch config from Hugging Face sam2_device: "cpu" # device for SAM2 inference, e.g. "cpu", "cuda", or "cuda:0" + sam2_num_workers: # optional cap on concurrent SAM2 mask-resolution workers; set to 1 to serialize GPU inference and avoid CUDA OOMs filter_params: ref_tile_size: ${tiling.params.requested_tile_size_px} # reference tile size at the target spacing a_t: 4 # area filter threshold for tissue (positive integer, the minimum size of detected foreground contours to consider, relative to the reference tile size ref_tile_size, e.g. a value 10 means only detected foreground contours of size greater than 10 [ref_tile_size, ref_tile_size] tiles at spacing tiling.params.requested_spacing_um will be kept) diff --git a/slide2vec/configs/resources.py b/slide2vec/configs/resources.py index 13f96a0..a6b8a05 100644 --- a/slide2vec/configs/resources.py +++ b/slide2vec/configs/resources.py @@ -1,11 +1,10 @@ from contextlib import contextmanager -from importlib.resources import as_file, files from pathlib import Path from typing import Iterator def config_resource(*parts: str): - path = files("slide2vec").joinpath("configs") + path = Path(__file__).resolve().parent for part in parts: path = path.joinpath(part) return path.with_suffix(".yaml") @@ -21,7 +20,4 @@ def load_config(*parts: str): @contextmanager def config_path(*parts: str) -> Iterator[Path]: - resource = config_resource(*parts) - with as_file(resource) as resolved: - yield resolved - + yield config_resource(*parts) diff --git a/slide2vec/inference.py b/slide2vec/inference.py 
index 5002955..52ee5a8 100644 --- a/slide2vec/inference.py +++ b/slide2vec/inference.py @@ -308,6 +308,11 @@ def embed_slides( slide_count=len(embeddable_slides), num_gpus=execution.num_gpus, ) + emit_progress( + "embedding.assignment.finished", + slide_count=len(embeddable_slides), + num_gpus=execution.num_gpus, + ) local_persist_callback = None if execution.output_dir is not None and execution.num_gpus <= 1: local_persist_callback, _, _ = _build_incremental_persist_callback( @@ -325,12 +330,6 @@ def embed_slides( work_dir=work_dir, on_embedded_slide=local_persist_callback, ) - if execution.num_gpus > 1 and len(embeddable_slides) > 1: - emit_progress( - "embedding.assignment.finished", - slide_count=len(embeddable_slides), - num_gpus=execution.num_gpus, - ) if execution.output_dir is not None and execution.num_gpus > 1: tile_artifacts: list[TileEmbeddingArtifact] = [] hierarchical_artifacts: list[HierarchicalEmbeddingArtifact] = [] @@ -2311,6 +2310,11 @@ def _run_distributed_embedding_stage( slide_count=len(successful_slides), num_gpus=execution.num_gpus, ) + emit_progress( + "embedding.assignment.finished", + slide_count=len(successful_slides), + num_gpus=execution.num_gpus, + ) runtime_distributed.run_torchrun_worker( module="slide2vec.distributed.pipeline_worker", num_gpus=execution.num_gpus, @@ -2320,11 +2324,6 @@ def _run_distributed_embedding_stage( progress_events_path=progress_events_path, popen_factory=runtime_distributed.subprocess.Popen, ) - emit_progress( - "embedding.assignment.finished", - slide_count=len(successful_slides), - num_gpus=execution.num_gpus, - ) def _embed_single_slide_distributed( diff --git a/slide2vec/runtime/batching.py b/slide2vec/runtime/batching.py index c02cdd0..8890ecb 100644 --- a/slide2vec/runtime/batching.py +++ b/slide2vec/runtime/batching.py @@ -7,6 +7,7 @@ import torch from transformers.image_processing_utils import BaseImageProcessor +from torchvision.transforms.functional import to_pil_image from slide2vec.progress 
import emit_progress from slide2vec.runtime.types import LoadedModel @@ -154,11 +155,23 @@ def prepare_batch_tensor(image): return image.float() +def _apply_transform_sample(sample, transforms): + if not torch.is_tensor(sample): + return transforms(sample) + try: + return transforms(sample) + except AttributeError as exc: + message = str(exc) + if "convert" not in message and "Tensor" not in message: + raise + return transforms(to_pil_image(sample.cpu())) + + def apply_transforms_itemwise(image, transforms): if not torch.is_tensor(image) or image.ndim <= 3: - return transforms(image) + return _apply_transform_sample(image, transforms) - transformed_items = [transforms(sample) for sample in image.cpu()] + transformed_items = [_apply_transform_sample(sample, transforms) for sample in image.cpu()] if not transformed_items: return image.new_empty((0,), dtype=torch.float32) if not all(torch.is_tensor(item) for item in transformed_items): diff --git a/tasks/lessons.md b/tasks/lessons.md index 3b7b027..e16aa19 100644 --- a/tasks/lessons.md +++ b/tasks/lessons.md @@ -7,6 +7,14 @@ - When slide2vec depends on bridged HS2P progress events, keep the bridge whitelist in sync with every reporter stage the UI renders; otherwise the code can define a preview bar and still never receive preview events. +## 2026-04-21 + +- When an itemwise preprocessing fallback must support both tensor-native transforms and PIL-only transforms, retry through PIL only after the tensor path actually fails with a PIL-style attribute error; do not force all tensor samples through PIL up front and break legitimate tensor transforms. + +## 2026-04-20 + +- When a progress bar is only meant to cover scheduling or assignment, emit its `finished` event before the downstream GPU work starts; otherwise the UI makes the orchestration phase look like it is still active while encoding is already running. 
+ ## 2026-04-18 - Keep `tiling.finished` for closing the live bar and emit the final summary on a separate `tiling.summary` event; otherwise the reporter ends up printing the same panel twice. diff --git a/tests/test_progress.py b/tests/test_progress.py index 292baa3..6138a0b 100644 --- a/tests/test_progress.py +++ b/tests/test_progress.py @@ -295,90 +295,48 @@ def _emit_tiling_summary(*args, **kwargs): ] -def test_run_pipeline_emits_assignment_progress_for_multi_gpu_embedding(monkeypatch, tmp_path: Path): +def test_distributed_embedding_stage_finishes_assignment_before_embedding_starts( + monkeypatch, tmp_path: Path +): import slide2vec.inference as inference import slide2vec.progress as progress reporter = RecordingReporter() - slide_a = SimpleNamespace( - sample_id="slide-a", - image_path=Path("/tmp/slide-a.svs"), - mask_path=None, - spacing_at_level_0=None, - ) - slide_b = SimpleNamespace( - sample_id="slide-b", - image_path=Path("/tmp/slide-b.svs"), - mask_path=None, - spacing_at_level_0=None, - ) - tiling_a = SimpleNamespace(x=np.array([0, 1]), y=np.array([0, 1]), tile_size_lv0=224) - tiling_b = SimpleNamespace(x=np.array([0, 1, 2]), y=np.array([0, 1, 2]), tile_size_lv0=224) - embedded_a = SimpleNamespace(sample_id="slide-a") - embedded_b = SimpleNamespace(sample_id="slide-b") + def _fake_run_torchrun_worker(*args, **kwargs): + progress.emit_progress( + "embedding.slide.started", + sample_id="slide-a", + total_tiles=5, + progress_label="cuda:0", + ) + monkeypatch.setattr(inference.runtime_distributed, "run_torchrun_worker", _fake_run_torchrun_worker) + monkeypatch.setattr(inference.runtime_distributed, "reset_progress_event_logs", lambda *args, **kwargs: None) monkeypatch.setattr( inference, - "_prepare_tiled_slides", - lambda *args, **kwargs: ([slide_a, slide_b], [tiling_a, tiling_b], tmp_path / "process_list.csv"), - ) - monkeypatch.setattr( - inference, - "_select_embedding_path", - lambda *args, **kwargs: [embedded_a, embedded_b], - ) - 
monkeypatch.setattr(inference, "_persist_embedded_slide", lambda *args, **kwargs: (None, None)) - monkeypatch.setattr(inference.runtime_distributed, "run_torchrun_worker", lambda *args, **kwargs: None) - monkeypatch.setattr( - inference, - "_collect_pipeline_artifacts", - lambda *args, **kwargs: (["tile-artifact"], [], ["slide-artifact"]), - ) - monkeypatch.setattr(inference, "_update_process_list_after_embedding", lambda *args, **kwargs: None) - monkeypatch.setattr(inference, "_validate_multi_gpu_execution", lambda *args, **kwargs: None) - monkeypatch.setattr( - inference, - "_emit_tiling_summary", - lambda *args, **kwargs: progress.emit_progress( - "tiling.summary", - total=2, - completed=2, - failed=0, - pending=0, - discovered_tiles=5, - ), + "_build_pipeline_worker_request_payload", + lambda *args, **kwargs: {}, ) - model = SimpleNamespace( - name="prism", - level="slide", - _requested_device="cuda:0", - _load_backend=lambda: SimpleNamespace(), - ) + model = SimpleNamespace(name="prism", level="slide", _requested_device="cuda:0") with progress.activate_progress_reporter(reporter): - result = inference.run_pipeline( + inference._run_distributed_embedding_stage( model, - slides=[slide_a, slide_b], + successful_slides=[ + SimpleNamespace(sample_id="slide-a"), + SimpleNamespace(sample_id="slide-b"), + ], preprocessing=DEFAULT_PREPROCESSING, execution=inference.ExecutionOptions(output_dir=tmp_path, num_gpus=2, save_tile_embeddings=True), + output_dir=tmp_path, ) kinds = [event.kind for event in reporter.events] - - assert result.tile_artifacts == ["tile-artifact"] - assert result.slide_artifacts == ["slide-artifact"] - assert kinds == [ - "run.started", - "tiling.started", - "tiling.summary", - "embedding.started", - "embedding.assignment.started", - "embedding.assignment.finished", - "embedding.finished", - "run.finished", - ] + assert kinds.count("embedding.assignment.started") == 1 + assert kinds.count("embedding.assignment.finished") == 1 + assert 
kinds.count("embedding.slide.started") == 1 def test_plain_text_reporter_formats_assignment_progress(): diff --git a/tests/test_regression_core.py b/tests/test_regression_core.py index 98d9349..f41b7ed 100644 --- a/tests/test_regression_core.py +++ b/tests/test_regression_core.py @@ -47,6 +47,7 @@ def test_packaged_preprocessing_config_matches_hs2p_4_tiling_schema(): assert hasattr(cfg.tiling.seg_params, "sam2_checkpoint_path") assert hasattr(cfg.tiling.seg_params, "sam2_config_path") assert hasattr(cfg.tiling.seg_params, "sam2_device") + assert "sam2_num_workers:" in (ROOT / "slide2vec" / "configs" / "default.yaml").read_text() assert hasattr(cfg.tiling.preview, "save_mask_preview") assert hasattr(cfg.tiling.preview, "save_tiling_preview") assert hasattr(cfg.tiling.preview, "tissue_contour_color") @@ -388,8 +389,8 @@ def test_cpu_worker_limit_caps_large_cpu_budget_to_sixty_four(monkeypatch): assert utils.cpu_worker_limit() == 64 -def test_execution_options_default_batch_size_is_one(): - assert ExecutionOptions().batch_size == 1 +def test_execution_options_default_batch_size_is_thirty_two(): + assert ExecutionOptions().batch_size == 32 def test_execution_options_default_num_workers_is_auto(): assert ExecutionOptions().num_workers is None