NVIDIA-NeMo · abhinavg4 · Jun 4, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -17,14 +17,24 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from loguru import logger
+
 from nemo_curator.core.utils import ignore_ray_head_node
 from nemo_curator.tasks import Task
+from nemo_curator.tasks.sentinels import FailedTask, NoneTask
 from nemo_curator.utils.performance_utils import StageTimer
+from nemo_curator.utils.resumability_client import _flush_deltas, _is_active, _skip_completed_sources
 
 if TYPE_CHECKING:
     from nemo_curator.stages.base import ProcessingStage
 
 
+def _is_sentinel(task: Task) -> bool:
+    """Return True if ``task`` is a payload-less marker (NoneTask/FailedTask);
+    such markers are stripped before the next stage rather than propagated."""
+    return isinstance(task, (NoneTask, FailedTask))
+
+
 @dataclass
 class NodeInfo:
     """Generic node information for setup_on_node calls across backends.
@@ -85,9 +95,23 @@ def process_batch(self, tasks: list[Task]) -> list[Task]:
             # Use the batch processing logic
             results = self.stage.process_batch(tasks)
 
+        # A returned ``None`` ("filter this slot") becomes a NoneTask so every
+        # output is a real Task that gets a task_id. Sentinels (NoneTask /
+        # FailedTask) carry no identity and are stripped again before this
+        # method returns.
+        results = [NoneTask() if r is None else r for r in results]
+
         # Guarantee every emitted task has a task_id (derived id, or uuid fallback).
         results = self._post_process_task_ids(tasks, results)
 
+        # Opt-in resumability: fire per-source counter deltas. A no-op (the
+        # client helpers self-disable) when no resumability actor is registered.
+        if _is_active():
+            results = self._apply_resumability_counters(tasks, results)
+
+        # Sentinels never propagate to the next stage.
+        results = [r for r in results if not _is_sentinel(r)]
+
         # Log performance stats and add to result tasks
         _, stage_perf_stats = self._timer.log_stats()
         # Consume and attach any custom metrics recorded by the stage during this call
@@ -168,6 +192,108 @@ def _post_process_task_ids(self, input_tasks: list[Task], output_tasks: list[Tas
             task.task_id = "r" + uuid.uuid4().hex
         return out
 
+    # ------------------------------------------------------------------ #
+    # Resumability (opt-in). Runs only when a resumability actor is
+    # registered. task_ids are already assigned by _post_process_task_ids;
+    # this layer only stamps _source_id, fires per-source counter deltas, and
+    # drops already-completed sources. Sentinels are stripped by the caller.
+    # ------------------------------------------------------------------ #
+    def _apply_resumability_counters(self, input_tasks: list[Task], output_tasks: list[Task]) -> list[Task]:  # noqa: C901
+        # Every delta's dedup key is an OUTPUT task_id, never an input's
+        # (``parent.task_id``). The source fires ``+1`` keyed on its output
+        # partition's id; that id is the *input* id of the next stage, so keying
+        # a downstream delta on the input would reuse the source's key and the
+        # actor would treat the two as one conflicting event. An output id is
+        # always one level deeper, so it's unique to the (task, stage) that
+        # produced it.
+        stage = self.stage
+        if getattr(stage, "is_source_stage", False):
+            return self._source_counters(output_tasks)
+
+        # No outputs at all. Filtering is expressed as None -> NoneTask (a kept
+        # slot), so a stage that emits nothing is degenerate; there is no output
+        # to key a delta on, so skip (like the ambiguous-cardinality case).
+        if not output_tasks:
+            return output_tasks
+
+        # Pre-source stages: inputs carry no _source_id, so there's nothing to
+        # track yet. Leave outputs untouched.
+        if all(not t._source_id for t in input_tasks):
+            return output_tasks
+
+        is_sink = stage.is_sink_stage
+        per_task: list[tuple[str, str, int]] = []
+        real = [t for t in output_tasks if not _is_sentinel(t)]
+
+        if len(input_tasks) == 1 and len(output_tasks) != 1:
-        if len(input_tasks) == 1 and len(output_tasks) != 1:
+        if len(input_tasks) == 1 and len(output_tasks) > 1:
-        if len(input_tasks) == 1 and len(output_tasks) != 1:
+        if len(input_tasks) == 1 and len(output_tasks) > 1:
+            # Genuine fan-out (1 -> N, N > 1; empty output already returned
+            # above). One net delta is fired for the parent: the parent itself
+            # is consumed (-1); each real child continues (+1) unless this is a
+            # sink, where children leave the pipeline (0); each FailedTask keeps
+            # the source open (+1); NoneTask contributes nothing. Sink and
+            # fan-out are independent, so the sink test applies here too.
+            parent = input_tasks[0]
+            n_failed = sum(1 for t in output_tasks if isinstance(t, FailedTask))
+            continuing = 0 if is_sink else len(real)
+            delta = continuing + n_failed - 1
+            # Key on output_tasks[0].task_id (NOT parent.task_id, which collides
+            # with the source's +1). It always ends in "_0": get_deterministic_id()
+            # is consulted only for source stages (which return via
+            # _source_counters and never reach here), so non-source children are
+            # indexed positionally (suffix 0, 1, ...) -> output[0] is "<parent>_0".
+            per_task.append((output_tasks[0].task_id, parent._source_id, delta))
+            for c in real:
+                if not c._source_id:
+                    c._source_id = parent._source_id
+        elif len(output_tasks) == len(input_tasks):
+            # Positional 1:1, including filtered (NoneTask) / failed slots. Each
+            # delta keys on the OUTPUT id (r.task_id).
+            for parent, r in zip(input_tasks, output_tasks, strict=True):
+                sid = parent._source_id
+                if isinstance(r, NoneTask):
+                    # Filtered: this slot is consumed.
+                    per_task.append((r.task_id, sid, -1))
+                    continue
+                if isinstance(r, FailedTask):
+                    # Failed: leave the source open so it reruns (no sink test).
+                    per_task.append((r.task_id, sid, 0))
+                    continue
+                # Real: a sink consumes it (-1); otherwise it passes through (0).
+                per_task.append((r.task_id, sid, -1 if is_sink else 0))
+                if not r._source_id:
+                    r._source_id = sid
+        else:
+            # M inputs -> K outputs (K != M): the parent of each output can't be
+            # determined, so the counter can't be updated correctly. Skip
+            # (the source counter stays pending -> reprocessed on resume).
+            logger.warning(
+                f"resumability: {type(stage).__name__} produced {len(output_tasks)} outputs "
+                f"for {len(input_tasks)} inputs; can't attribute sources, skipping counter "
+                f"update for this batch."
+            )
+            return output_tasks
+
+        _flush_deltas(per_task)
+        return output_tasks
+
+    def _source_counters(self, output_tasks: list[Task]) -> list[Task]:
+        """Source stage: each output is a source partition. Its ``_source_id``
+        is its own (last) id segment — the content id or index assigned by
+        ``_post_process_task_ids``. Already-completed sources are dropped; each
+        surviving source fires a ``+1``."""
+        sources = [t for t in output_tasks if not _is_sentinel(t)]
+        for t in sources:
+            t._source_id = t.task_id.rsplit("_", 1)[-1]
+        completed = _skip_completed_sources([t._source_id for t in sources])
+        per_task: list[tuple[str, str, int]] = []
+        survivors: list[Task] = []
+        for t in sources:
+            if t._source_id in completed:
+                continue
+            per_task.append((t.task_id, t._source_id, +1))
+            survivors.append(t)
+        _flush_deltas(per_task)
+        return survivors
+
     def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None:
         """Setup the stage on a node.
 

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pathlib import Path
 from typing import Any
 
 from loguru import logger
@@ -222,18 +223,35 @@ def describe(self) -> str:
 
         return "\n".join(lines)
 
-    def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] | None = None) -> list[Task] | None:
+    def run(
+        self,
+        executor: BaseExecutor | None = None,
+        initial_tasks: list[Task] | None = None,
+        checkpoint_path: str | Path | None = None,
+    ) -> list[Task] | None:
         """Run the pipeline.
 
         Args:
             executor (BaseExecutor): Executor to use
             initial_tasks (list[Task], optional): Initial tasks to start the pipeline with. Defaults to None.
+            checkpoint_path (str | Path, optional): Directory used for
+                resumability. When set, completed source partitions are tracked
+                across runs and skipped on rerun; the tracking state lives in a
+                ``.nemo_curator_metadata`` subdirectory. Multiple independent
+                runs (e.g. the tasks of a SLURM array) may point at the same
+                directory — each writes its own LMDB file, so there is no
+                shared-file contention. The actor lifecycle is owned by this
+                method; executors are not modified.
 
         Returns:
             list[Task] | None: List of tasks
         """
         self.build()
 
+        if checkpoint_path is not None:
+            checkpoint_path = Path(checkpoint_path).absolute()
+            checkpoint_path.mkdir(parents=True, exist_ok=True)
+
         if executor is None:
             from nemo_curator.backends.xenna import XennaExecutor
 
@@ -263,4 +281,46 @@ def run(self, executor: BaseExecutor | None = None, initial_tasks: list[Task] |
         if initial_tasks:
             assign_root_task_ids(initial_tasks)
 
-        return executor.execute(self.stages, initial_tasks)
+        if checkpoint_path is None:
+            return executor.execute(self.stages, initial_tasks)
+        return self._run_with_resumability(executor, initial_tasks, checkpoint_path)
+
+    def _run_with_resumability(
+        self,
+        executor: BaseExecutor,
+        initial_tasks: list[Task] | None,
+        checkpoint_path: Path,
+    ) -> list[Task] | None:
+        """Owns the full resumability-actor lifecycle. Per-backend executors
+        are not modified — the actor is spawned with ``lifetime="detached"`` so
+        it survives executor-local ``ray.shutdown()`` calls.
+
+        The actor never raises (see ``ResumabilityActor.apply_deltas``), so
+        there's no watchdog and no error propagation path here — just spawn,
+        run, close.
+        """
+        import ray
+
+        from nemo_curator.utils.resumability_actor import ResumabilityActor
+        from nemo_curator.utils.resumability_client import ACTOR_NAME
+
+        ray.init(ignore_reinit_error=True)
+        ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
-        ray.init(ignore_reinit_error=True)
-        ResumabilityActor.options(  # type: ignore[attr-defined]
-            name=ACTOR_NAME,
-            lifetime="detached",
-            get_if_exists=True,
-            max_pending_calls=100,
-        ).remote(str(checkpoint_path))
+        ray.init(ignore_reinit_error=True)
+        actor_handle = ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
+        # Verify the actor started successfully; surfaces any __init__ exception
+        # (e.g. LMDB open failure) before the pipeline begins so the user is not
+        # left believing checkpointing is active when it silently isn't.
+        ray.get(actor_handle.are_completed.remote([]), timeout=30)  # type: ignore[attr-defined]
 ray.get(actor_handle.wait.remote()) 
-        ray.init(ignore_reinit_error=True)
-        ResumabilityActor.options(  # type: ignore[attr-defined]
-            name=ACTOR_NAME,
-            lifetime="detached",
-            get_if_exists=True,
-            max_pending_calls=100,
-        ).remote(str(checkpoint_path))
+        ray.init(ignore_reinit_error=True)
+        actor_handle = ResumabilityActor.options(  # type: ignore[attr-defined]
+            name=ACTOR_NAME,
+            lifetime="detached",
+            get_if_exists=True,
+            max_pending_calls=100,
+        ).remote(str(checkpoint_path))
+        # Verify the actor started successfully; surfaces any __init__ exception
+        # (e.g. LMDB open failure) before the pipeline begins so the user is not
+        # left believing checkpointing is active when it silently isn't.
+        ray.get(actor_handle.are_completed.remote([]), timeout=30)  # type: ignore[attr-defined]
 ray.get(actor_handle.wait.remote()) 
+
+        try:
+            return executor.execute(self.stages, initial_tasks)
+        finally:
+            # The executor may already have called ray.shutdown() in its own
+            # ``finally`` block, so reconnect before cleaning up the detached actor.
+            try:
+                ray.init(ignore_reinit_error=True)
+                actor_handle = ray.get_actor(ACTOR_NAME)
+                ray.get(actor_handle.close.remote(), timeout=10)  # type: ignore[attr-defined]
+                ray.kill(actor_handle)
+            except Exception as e:  # noqa: BLE001
+                logger.warning(f"resumability actor cleanup failed: {e}")
@@ -17,17 +17,19 @@
 from .file_group import FileGroupTask
 from .image import ImageBatch, ImageObject
 from .interleaved import InterleavedBatch
-from .sentinels import EmptyTask, SentinelTask
+from .sentinels import EmptyTask, FailedTask, NoneTask, SentinelTask
 from .tasks import Task
 
 __all__ = [
     "AudioTask",
     "DocumentBatch",
     "EmptyTask",
+    "FailedTask",
     "FileGroupTask",
     "ImageBatch",
     "ImageObject",
     "InterleavedBatch",
+    "NoneTask",
     "SentinelTask",
     "Task",
 ]
@@ -13,9 +13,18 @@
 # limitations under the License.
 """Payload-less marker tasks.
 
-``EmptyTask`` seeds a pipeline (the implicit root id ``"0"``). All markers
-share the :class:`SentinelTask` base and carry no payload (``data is None``).
-Construct one with ``EmptyTask()``.
+``EmptyTask`` seeds a pipeline (the implicit root id ``"0"``). The resumability
+layer adds two more markers on the same :class:`SentinelTask` base:
+
+- ``NoneTask`` — this slot was intentionally filtered. The resumability counter
- ``NoneTask`` — this slot was intentionally filtered. The resumability counter
+- ``NoneTask`` - this task was intentionally filtered. The resumability counter
- ``NoneTask`` — this slot was intentionally filtered. The resumability counter
+- ``NoneTask`` - this task was intentionally filtered. The resumability counter
+  treats it as a consumed branch (decrements). The adapter auto-wraps a
+  returned ``None`` as a ``NoneTask``.
+- ``FailedTask`` — this slot failed and should be retried on resume. The counter
- ``FailedTask`` — this slot failed and should be retried on resume. The counter
+- ``FailedTask`` — this slot failed and should be retried on resume. The resumability counter
- ``FailedTask`` — this slot failed and should be retried on resume. The counter
+- ``FailedTask`` — this slot failed and should be retried on resume. The resumability counter
+  is NOT decremented, so its source stays pending and reruns.
+
+All carry no payload (``data is None``) and get their ``task_id`` assigned by
+the executor adapter; sentinels are stripped before the next stage. Construct
+with ``EmptyTask()`` / ``NoneTask()`` / ``FailedTask()``.
 """
 
 from dataclasses import dataclass, field
@@ -52,3 +61,17 @@ class EmptyTask(SentinelTask):
 
     dataset_name: str = "empty"
     task_id: str = field(init=False, default="0")
+
+
+@dataclass
+class NoneTask(SentinelTask):
+    """Marks a slot as intentionally filtered (resumability counter decrements)."""
+
+    dataset_name: str = "none"
+
+
+@dataclass
+class FailedTask(SentinelTask):
+    """Marks a slot as failed so it is retried on resume (resumability counter does NOT decrement)."""
+
+    dataset_name: str = "failed"
@@ -46,13 +46,18 @@ class Task(ABC, Generic[T]):
             NON-deterministic (differ across runs).
         dataset_name: Name of the dataset this task belongs to.
         _stage_perf: List of stages perfs this task has passed through.
+        _source_id: Identifier of the source (input partition) this task
+            descends from. Stamped at the source stage and inherited
+            downstream; used only by the (opt-in) resumability layer to
+            track which sources have completed. Empty for pre-source tasks.
     """
 
     dataset_name: str
     data: T
     _stage_perf: list[StagePerfStats] = field(default_factory=list)
     _metadata: dict[str, Any] = field(default_factory=dict)
     task_id: str = field(init=False, default="")
+    _source_id: str = field(init=False, default="")
 
     def __post_init__(self) -> None:
         """Post-initialization hook."""