NVIDIA-NeMo · abhinavg4 · Jun 4, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -17,14 +17,23 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from loguru import logger
+
 from nemo_curator.core.utils import ignore_ray_head_node
 from nemo_curator.tasks import Task
+from nemo_curator.tasks.sentinels import FailedTask, NoneTask
 from nemo_curator.utils.performance_utils import StageTimer
+from nemo_curator.utils.resumability_client import _flush_deltas, _is_active, _skip_completed_sources
 
 if TYPE_CHECKING:
     from nemo_curator.stages.base import ProcessingStage
 
 
+def _is_sentinel(task: Task) -> bool:
+    """A payload-less marker (NoneTask/FailedTask), stripped before the next stage."""
+    return isinstance(task, (NoneTask, FailedTask))
+
+
 @dataclass
 class NodeInfo:
     """Generic node information for setup_on_node calls across backends.
@@ -85,9 +94,20 @@ def process_batch(self, tasks: list[Task]) -> list[Task]:
             # Use the batch processing logic
             results = self.stage.process_batch(tasks)
 
-        # Guarantee every emitted task has a task_id (derived id, or uuid fallback).
+        # Replace a returned None ("filter this slot") with a NoneTask so every
+        # output gets a task_id; sentinels are stripped again below.
+        results = [NoneTask() if r is None else r for r in results]
+
+        # Assign every emitted task a task_id (derived, or uuid fallback).
         results = self._post_process_task_ids(tasks, results)
 
+        # Opt-in resumability: fire per-source deltas (no-op when no actor registered).
+        if _is_active():
+            results = self._apply_resumability_counters(tasks, results)
+
+        # Sentinels never propagate to the next stage.
+        results = [r for r in results if not _is_sentinel(r)]
+
         # Log performance stats and add to result tasks
         _, stage_perf_stats = self._timer.log_stats()
         # Consume and attach any custom metrics recorded by the stage during this call
@@ -100,41 +120,22 @@ def process_batch(self, tasks: list[Task]) -> list[Task]:
         return results
 
     def _post_process_task_ids(self, input_tasks: list[Task], output_tasks: list[Task | None]) -> list[Task]:
-        """Assign a deterministic ``task_id`` to every emitted task.
-
-        This is the single place task ids are assigned — it runs for every
-        stage on every backend (all backend adapters subclass this), so it
-        makes no difference whether a stage defines ``process`` or overrides
-        ``process_batch``. ``task_id`` is the task's id path (parents + own segment); ids are
-        re-derived at each stage boundary so the same object passing through
-        N stages gets N ids.
-
-        The input→output mapping decides each output's PARENT; whether the
-        stage is a source decides each output's SEGMENT (content id vs index)
-        — the two are independent. ``None`` outputs (Curator's "return None to
-        filter") are NOT removed before the length check — keeping them in
-        place preserves positional alignment for filter stages — and are then
-        dropped from the returned list.
-
-        - single input → every output is its child (fan-out): ``parent_<seg>``
-        - ``len(output) == len(input)`` → positional 1:1: each ``parent_i_<seg>``;
-          a ``None`` slot just means input ``i`` was filtered.
-        - any other (ambiguous) cardinality across a batch → a random ``uuid``
-          prefixed with ``"r"`` (e.g. ``"r3f9a…"``), so ``task_id`` is never
-          empty even when a derived id is not possible. The ``"r"`` prefix flags
-          the id as non-deterministic / ancestry-not-tracked (see
-          ``Task.task_id`` docstring).
-
-        ``seg`` is the output's content id (``Task.get_deterministic_id()``)
-        for a source stage when available, else the positional index — so a
-        source partition keeps a stable id across reorderings regardless of
-        whether the source is 1→N or N→N.
-
-        Note: a stage that BOTH filters and fans out within a single batch
-        (returning a flat list rather than a per-input slot) cannot be mapped
-        positionally; if its length happens to equal the input length the 1:1
-        assumption may misattribute parents. That combination is unsupported
-        until per-slot sentinels (NoneTask/FailedTask) land in a later PR.
+        """Assign a deterministic ``task_id`` (parent id + own segment) to every
+        emitted task. Runs once per stage on every backend, so ``process`` vs
+        ``process_batch`` makes no difference; ids are re-derived at each stage
+        boundary, so one object passing through N stages gets N ids.
+
+        - single input → fan-out: each output is ``parent_<seg>``
+        - ``len(output) == len(input)`` → positional 1:1: ``parent_i_<seg>``; a
+          ``None`` slot means input ``i`` was filtered (kept for alignment, then
+          dropped from the result)
+        - any other cardinality → a random ``"r"``-prefixed uuid (non-deterministic,
+          ancestry-not-tracked; see ``Task.task_id``)
+
+        ``seg`` is the content id (``get_deterministic_id()``) for a source stage,
+        else the positional index. A stage that both filters and fans out in one
+        batch can't be mapped positionally and falls to the ``"r"`` case — return
+        one value (or ``None``) per input to stay positional.
         """
         is_source = getattr(self.stage, "is_source_stage", False)
 
@@ -168,6 +169,85 @@ def _post_process_task_ids(self, input_tasks: list[Task], output_tasks: list[Tas
             task.task_id = "r" + uuid.uuid4().hex
         return out
 
+    # Resumability (opt-in): stamp _source_id, fire per-source deltas, drop
+    # completed sources. task_ids are already assigned; sentinels stripped by caller.
+    def _apply_resumability_counters(self, input_tasks: list[Task], output_tasks: list[Task]) -> list[Task]:  # noqa: C901
+        # Dedup key is always an OUTPUT task_id, never the input's: the source
+        # already keyed its +1 on that id, and an output id is one level deeper,
+        # so it's unique to the (task, stage) that produced it.
+        stage = self.stage
+        if getattr(stage, "is_source_stage", False):
+            return self._source_counters(output_tasks)
+
+        # No outputs to key on (filtering uses None->NoneTask, so this is degenerate): skip.
+        if not output_tasks:
+            return output_tasks
+
+        # Pre-source: inputs have no _source_id yet; nothing to track.
+        if all(not t._source_id for t in input_tasks):
+            return output_tasks
+
+        is_sink = stage.is_sink_stage
+        per_task: list[tuple[str, str, int]] = []
+        real = [t for t in output_tasks if not _is_sentinel(t)]
+
+        if len(input_tasks) == 1 and len(output_tasks) != 1:
-        if len(input_tasks) == 1 and len(output_tasks) != 1:
+        if len(input_tasks) == 1 and len(output_tasks) > 1:
-        if len(input_tasks) == 1 and len(output_tasks) != 1:
+        if len(input_tasks) == 1 and len(output_tasks) > 1:
+            # Fan-out (1->N): parent consumed (-1); each real child continues
+            # (+1, or 0 at a sink); each FailedTask keeps the source open (+1);
+            # NoneTask contributes 0.
+            parent = input_tasks[0]
+            n_failed = sum(1 for t in output_tasks if isinstance(t, FailedTask))
+            continuing = 0 if is_sink else len(real)
+            delta = continuing + n_failed - 1
+            # Key on output[0].task_id (not parent.task_id, which collides with the
+            # source's +1). Non-source children are indexed positionally, so
+            # output[0] is always "<parent>_0".
+            per_task.append((output_tasks[0].task_id, parent._source_id, delta))
+            for c in real:
+                if not c._source_id:
+                    c._source_id = parent._source_id
+        elif len(output_tasks) == len(input_tasks):
+            # Positional 1:1; each delta keys on the output id (r.task_id).
+            for parent, r in zip(input_tasks, output_tasks, strict=True):
+                sid = parent._source_id
+                if isinstance(r, NoneTask):  # filtered -> consumed
+                    per_task.append((r.task_id, sid, -1))
+                    continue
+                if isinstance(r, FailedTask):  # failed -> source stays open (no sink test)
+                    per_task.append((r.task_id, sid, 0))
+                    continue
+                per_task.append((r.task_id, sid, -1 if is_sink else 0))  # real: sink -1, else 0
+                if not r._source_id:
+                    r._source_id = sid
+        else:
+            # M->K (M!=K): can't attribute parents; skip (source stays pending -> reprocessed).
+            logger.warning(
+                f"resumability: {type(stage).__name__} produced {len(output_tasks)} outputs "
+                f"for {len(input_tasks)} inputs; can't attribute sources, skipping counter "
+                f"update for this batch."
+            )
+            return output_tasks
+
+        _flush_deltas(per_task)
+        return output_tasks
+
+    def _source_counters(self, output_tasks: list[Task]) -> list[Task]:
+        """Source stage: each output is a source partition; its ``_source_id`` is
+        its own last id segment. Drop already-completed sources; each survivor fires ``+1``."""
+        sources = [t for t in output_tasks if not _is_sentinel(t)]
+        for t in sources:
+            t._source_id = t.task_id.rsplit("_", 1)[-1]
+        completed = _skip_completed_sources([t._source_id for t in sources])
+        per_task: list[tuple[str, str, int]] = []
+        survivors: list[Task] = []
+        for t in sources:
+            if t._source_id in completed:
+                continue
+            per_task.append((t.task_id, t._source_id, +1))
+            survivors.append(t)
+        _flush_deltas(per_task)
+        return survivors
+
     def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None:
         """Setup the stage on a node.
 

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pathlib import Path
 from typing import Any
 
 from loguru import logger
@@ -107,8 +108,9 @@ def build(self) -> None:
         # 3. Source / sink defaults: at most one stage may be explicitly
         # marked; if none, the first stage is the source and the last is
         # the sink. The source flag activates content-based ids in the
-        # default ``process_batch``; the sink flag is used by the
-        # resumability layer in a follow-up PR.
+        # default ``process_batch``; the sink flag tells the resumability
+        # counters that a sink consumes its outputs (see
+        # ``BaseStageAdapter._apply_resumability_counters``).
         self._assign_source_sink_roles()
 
     def _assign_source_sink_roles(self) -> None:
@@ -222,18 +224,32 @@ def describe(self) -> str:
 
         return "\n".join(lines)
 
-    def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] | None = None) -> list[Task] | None:
+    def run(
+        self,
+        executor: BaseExecutor | None = None,
+        initial_tasks: list[Task] | None = None,
+        checkpoint_path: str | Path | None = None,
+    ) -> list[Task] | None:
         """Run the pipeline.
 
         Args:
             executor (BaseExecutor): Executor to use
             initial_tasks (list[Task], optional): Initial tasks to start the pipeline with. Defaults to None.
+            checkpoint_path (str | Path, optional): Resumability directory. When
+                set, completed source partitions are tracked (in a
+                ``.nemo_curator_metadata`` subdir) and skipped on rerun. Multiple
+                runs (e.g. a SLURM array) may share the directory — each writes
+                its own LMDB file, so there is no contention.
 
         Returns:
             list[Task] | None: List of tasks
         """
         self.build()
 
+        if checkpoint_path is not None:
+            checkpoint_path = Path(checkpoint_path).absolute()
+            checkpoint_path.mkdir(parents=True, exist_ok=True)
+
         if executor is None:
             from nemo_curator.backends.xenna import XennaExecutor
 
@@ -263,4 +279,41 @@ def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] |
         if initial_tasks:
             assign_root_task_ids(initial_tasks)
 
-        return executor.execute(self.stages, initial_tasks)
+        if checkpoint_path is None:
+            return executor.execute(self.stages, initial_tasks)
+        return self._run_with_resumability(executor, initial_tasks, checkpoint_path)
+
+    def _run_with_resumability(
+        self,
+        executor: BaseExecutor,
+        initial_tasks: list[Task] | None,
+        checkpoint_path: Path,
+    ) -> list[Task] | None:
+        """Own the resumability-actor lifecycle (executors unmodified): spawn it
+        ``lifetime="detached"`` so it survives executor-local ``ray.shutdown()``,
+        run, then close. The actor never raises, so there's no error path here."""
+        import ray
+
+        from nemo_curator.utils.resumability_actor import ResumabilityActor
+        from nemo_curator.utils.resumability_client import ACTOR_NAME
+
+        ray.init(ignore_reinit_error=True)
+        ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
-        ray.init(ignore_reinit_error=True)
-        ResumabilityActor.options(  # type: ignore[attr-defined]
-            name=ACTOR_NAME,
-            lifetime="detached",
-            get_if_exists=True,
-            max_pending_calls=100,
-        ).remote(str(checkpoint_path))
+        ray.init(ignore_reinit_error=True)
+        actor_handle = ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
+        # Verify the actor started successfully; surfaces any __init__ exception
+        # (e.g. LMDB open failure) before the pipeline begins so the user is not
+        # left believing checkpointing is active when it silently isn't.
+        ray.get(actor_handle.are_completed.remote([]), timeout=30)  # type: ignore[attr-defined]
 ray.get(actor_handle.wait.remote()) 
-        ray.init(ignore_reinit_error=True)
-        ResumabilityActor.options(  # type: ignore[attr-defined]
-            name=ACTOR_NAME,
-            lifetime="detached",
-            get_if_exists=True,
-            max_pending_calls=100,
-        ).remote(str(checkpoint_path))
+        ray.init(ignore_reinit_error=True)
+        actor_handle = ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
+        # Verify the actor started successfully; surfaces any __init__ exception
+        # (e.g. LMDB open failure) before the pipeline begins so the user is not
+        # left believing checkpointing is active when it silently isn't.
+        ray.get(actor_handle.are_completed.remote([]), timeout=30)  # type: ignore[attr-defined]
 ray.get(actor_handle.wait.remote()) 
+
+        try:
+            return executor.execute(self.stages, initial_tasks)
+        finally:
+            # The executor's ray.shutdown() may have run in its own
+            # finally:; reconnect to clean up the detached actor.
+            try:
+                ray.init(ignore_reinit_error=True)
+                actor_handle = ray.get_actor(ACTOR_NAME)
+                ray.get(actor_handle.close.remote(), timeout=10)  # type: ignore[attr-defined]
+                ray.kill(actor_handle)
+            except Exception as e:  # noqa: BLE001
+                logger.warning(f"resumability actor cleanup failed: {e}")
@@ -17,17 +17,19 @@
 from .file_group import FileGroupTask
 from .image import ImageBatch, ImageObject
 from .interleaved import InterleavedBatch
-from .sentinels import EmptyTask, SentinelTask
+from .sentinels import EmptyTask, FailedTask, NoneTask, SentinelTask
 from .tasks import Task
 
 __all__ = [
     "AudioTask",
     "DocumentBatch",
     "EmptyTask",
+    "FailedTask",
     "FileGroupTask",
     "ImageBatch",
     "ImageObject",
     "InterleavedBatch",
+    "NoneTask",
     "SentinelTask",
     "Task",
 ]
@@ -11,11 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Payload-less marker tasks.
-
-``EmptyTask`` seeds a pipeline (the implicit root id ``"0"``). All markers
-share the :class:`SentinelTask` base and carry no payload (``data is None``).
-Construct one with ``EmptyTask()``.
+"""Payload-less marker tasks on a shared :class:`SentinelTask` base:
+``EmptyTask`` (pipeline seed, root id ``"0"``), ``NoneTask`` (filtered slot;
+counter decrements), ``FailedTask`` (failed slot; counter unchanged so its
+source stays pending and reruns). All carry no data, get a framework-assigned
+``task_id``, and are stripped before the next stage.
 """
 
 from dataclasses import dataclass, field
@@ -25,8 +25,7 @@
 
 @dataclass
 class SentinelTask(Task[None]):
-    """Base for payload-less marker tasks. Always carries no data; ``task_id``
-    is framework-assigned like any other task."""
+    """Base for payload-less marker tasks: no data, framework-assigned ``task_id``."""
 
     data: None = None
 
@@ -44,11 +43,22 @@ def validate(self) -> bool:
 
 @dataclass
 class EmptyTask(SentinelTask):
-    """Payload-less task that seeds a pipeline. Its ``task_id`` is fixed to
-    ``"0"`` — the implicit root every task in a run descends from, so all
-    ``task_id``s share the ``"0"`` prefix (source partitions become
-    ``"0_<id>"``, user-provided initial tasks become ``"0_0"``, ``"0_1"``, …).
-    """
+    """Seeds a pipeline with ``task_id="0"`` — the implicit root every task
+    descends from, so all ids share the ``"0"`` prefix (source partitions
+    become ``"0_<id>"``; user-provided initial tasks ``"0_0"``, ``"0_1"``, …)."""
 
     dataset_name: str = "empty"
     task_id: str = field(init=False, default="0")
+
+
+@dataclass
+class NoneTask(SentinelTask):
+    """Marks a slot as intentionally filtered.
+
+    Carries no data. The resumability layer treats a filtered slot as consumed
+    (its counter decrements).
+    """
+
+    dataset_name: str = "none"
+
+
+@dataclass
+class FailedTask(SentinelTask):
+    """Marks a slot as failed.
+
+    Carries no data. The resumability counter does NOT decrement for a failed
+    slot, so its source stays pending and is retried on resume.
+    """
+
+    dataset_name: str = "failed"
@@ -46,13 +46,17 @@ class Task(ABC, Generic[T]):
             NON-deterministic (differ across runs).
         dataset_name: Name of the dataset this task belongs to.
         _stage_perf: List of stages perfs this task has passed through.
+        _source_id: Source (input partition) this task descends from. Stamped at
+            the source stage, inherited downstream; used only by the opt-in
+            resumability layer. Empty for pre-source tasks.
     """
 
     dataset_name: str
     data: T
     _stage_perf: list[StagePerfStats] = field(default_factory=list)
     _metadata: dict[str, Any] = field(default_factory=dict)
     task_id: str = field(init=False, default="")
+    _source_id: str = field(init=False, default="")
 
     def __post_init__(self) -> None:
         """Post-initialization hook."""