Merge branch 'main' into slurm_array

sarahyurick · web-flow · commit be338eafe45e · 2026-06-29T12:39:18.000-07:00
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -62,7 +62,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        folder: ["backends", "config", "core", "models", "pipelines", "stages-audio", "stages-common", "stages-deduplication", "stages-image", "stages-interleaved", "stages-math_stages", "stages-synthetic", "stages-text", "stages-video", "eval", "tasks", "utils"]
+        folder: ["backends", "benchmarking", "config", "core", "models", "pipelines", "stages-audio", "stages-common", "stages-deduplication", "stages-image", "stages-interleaved", "stages-math_stages", "stages-synthetic", "stages-text", "stages-video", "eval", "tasks", "utils"]
     needs: [pre-flight, cicd-wait-in-queue]
     runs-on: ubuntu-latest
     name: Unit_Test_${{ matrix.folder }}_CPU
diff --git a/benchmarking/README.md b/benchmarking/README.md
@@ -145,9 +145,13 @@ paths:
     host_path: /path/to/model_weights
     container_path: /model_weights  # optional override
 
-# Optional: Global timeout for all entries (seconds)
+# Optional: Global timeout for entries that omit timeout_s (seconds)
 default_timeout_s: 7200
 
+# Optional: Maximum allowed effective timeout for any entry (seconds); entries exceeding it fail validation.
+# Defaults to 14340 (3h59m).
+max_timeout_s: 14340
+
 # Optional: Delete scratch directories after each entry completes
 # The path {session_entry_dir}/scratch is automatically created when an entry starts and can be used by benchmark
 #scripts for writing temp files. This directory is automatically cleaned up on completion of the entry if
diff --git a/benchmarking/nightly-benchmark.yaml b/benchmarking/nightly-benchmark.yaml
@@ -127,9 +127,11 @@ datasets:
       path: "{datasets_path}/fleurs"
 # Timeout knobs consumed by tools/generate_ci_tests.py to compute each SLURM job's wall-clock:
 #   max(entry.timeout_s (or default_timeout_s) + cleanup_timeout_s, min_timeout_s)
-# default_timeout_s: per-entry default; cleanup_timeout_s: post-run cleanup buffer; min_timeout_s:
-# floor covering container setup. (default_timeout_s is also the runner's per-entry default.)
+# default_timeout_s: per-entry default; max_timeout_s: per-entry ceiling after config merging;
+# cleanup_timeout_s: post-run cleanup buffer; min_timeout_s: floor covering container setup.
+# max_timeout_s is 3h59m so a max-length entry plus the 60s cleanup buffer fits in 4h.
 default_timeout_s: 7200
+max_timeout_s: 14340
 cleanup_timeout_s: 60
 min_timeout_s: 600
 
@@ -739,10 +741,10 @@ entries:
   - name: audio_readspeech_xenna
     enabled: true
     script: audio_readspeech_benchmark.py
-    # Hang guard only (~4h). Full dataset with all filters runs ~3.24h on 4×A100.
+    # Hang guard only (3h30m). Full dataset with all filters runs ~3.24h on 4×A100.
     # Performance regressions should be tracked via a `requirements` check on
     # `time_taken_s` once a stable baseline is established (follow-up).
-    timeout_s: 14400
+    timeout_s: 12600
     args: >-
       --benchmark-results-path={session_entry_dir}
       --scratch-output-path={session_entry_dir}/scratch
@@ -775,9 +777,9 @@ entries:
   - name: audio_readspeech_raydata
     enabled: true
     script: audio_readspeech_benchmark.py
-    # Hang guard only (~4h). Mirrors xenna config so the two executors are
+    # Hang guard only (3h30m). Mirrors xenna config so the two executors are
     # directly comparable on the same workload (full dataset, all filters).
-    timeout_s: 14400
+    timeout_s: 12600
     args: >-
       --benchmark-results-path={session_entry_dir}
       --scratch-output-path={session_entry_dir}/scratch
diff --git a/benchmarking/runner/session.py b/benchmarking/runner/session.py
@@ -41,6 +41,9 @@ class Session:
     entries: list[Entry] = field(default_factory=list)
     sinks: list[Sink] = field(default_factory=list)
     default_timeout_s: int = 7200
+    # Maximum allowed per-entry timeout after default_timeout_s has been applied.
+    # 3h59m keeps generated CI wall-clock within common 4h limits once the default 60s cleanup buffer is added.
+    max_timeout_s: int = 14340
     # object store size is either a value in bytes (int), a fraction of total system memory (float), or None or the
     # value "default" (string) both representing the default object store size as used by "ray start".
     object_store_size: int | float | str | None = 0.5
@@ -55,7 +58,7 @@ class Session:
     path_resolver: PathResolver = None
     dataset_resolver: DatasetResolver = None
 
-    def __post_init__(self) -> None:  # noqa: C901
+    def __post_init__(self) -> None:  # noqa: C901, PLR0912
         """Post-initialization checks and updates for dataclass."""
         names = [entry.name for entry in self.entries]
         if len(names) != len(set(names)):
@@ -75,15 +78,27 @@ def __post_init__(self) -> None:  # noqa: C901
             )
             raise ValueError(msg)
 
+        if not isinstance(self.max_timeout_s, int) or isinstance(self.max_timeout_s, bool) or self.max_timeout_s <= 0:
+            msg = f"Invalid max_timeout_s: {self.max_timeout_s}; must be a positive integer."
+            raise ValueError(msg)
+
         # Update delete_scratch for each entry that has not been set to the session-level delete_scratch setting
         for entry in self.entries:
             if entry.delete_scratch is None:
                 entry.delete_scratch = self.delete_scratch
 
-        # Update timeout_s for each entry that has not been set to the session-level default_timeout_s
+        # For each entry whose timeout_s is unset, apply the session-level default_timeout_s,
+        # then enforce the session-level max_timeout_s against the resulting effective value.
         for entry in self.entries:
             if entry.timeout_s is None:
                 entry.timeout_s = self.default_timeout_s
+            if entry.timeout_s > self.max_timeout_s:
+                msg = (
+                    f"Entry '{entry.name}' has timeout_s={entry.timeout_s}, which exceeds "
+                    f"max_timeout_s={self.max_timeout_s}. Entry timeouts are validated after "
+                    "all YAML files have been merged and default_timeout_s has been applied."
+                )
+                raise ValueError(msg)
 
         # Update object store size for each entry that has not been set.
         for entry in self.entries:
diff --git a/benchmarking/tools/generate_ci_tests.py b/benchmarking/tools/generate_ci_tests.py
@@ -28,6 +28,8 @@
 # Fallbacks used only when nightly-benchmark.yaml omits the corresponding key; the YAML config
 # is the source of truth for these timeout knobs.
 DEFAULT_TIMEOUT_S = 7200  # mirrors Session.default_timeout_s in runner/session.py
+# Slurm caps CI jobs at 4h, so this default leaves room for the 60s cleanup buffer.
+DEFAULT_MAX_TIMEOUT_S = 14340  # mirrors Session.max_timeout_s in runner/session.py
 DEFAULT_CLEANUP_TIMEOUT_S = 60
 DEFAULT_MIN_TIMEOUT_S = 600
 
@@ -70,7 +72,14 @@ def session_name_from_env(env: Mapping[str, str] = os.environ) -> str | None:
     return None
 
 
-def generate_job(entry: dict, scope: str, default_timeout_s: int, cleanup_timeout_s: int, min_timeout_s: int) -> dict:
+def generate_job(  # noqa: PLR0913
+    entry: dict,
+    scope: str,
+    default_timeout_s: int,
+    cleanup_timeout_s: int,
+    min_timeout_s: int,
+    max_timeout_s: int,
+) -> dict:
     """
     Generate a GitLab CI job for a single benchmark entry.
 
@@ -80,14 +89,20 @@ def generate_job(entry: dict, scope: str, default_timeout_s: int, cleanup_timeou
         default_timeout_s: Timeout used for entries that omit "timeout_s"
         cleanup_timeout_s: Buffer added on top of every entry's timeout for post-run cleanup
         min_timeout_s: Floor on the generated job time to cover container setup overhead
+        max_timeout_s: Maximum allowed effective entry timeout before cleanup time is added
 
     Returns:
         job: Dictionary defining the GitLab CI job
     """
     ray = entry.get("ray", {})
+    entry_timeout_s = entry.get("timeout_s", default_timeout_s)
+    if entry_timeout_s > max_timeout_s:
+        msg = f"Entry '{entry['name']}' has timeout_s={entry_timeout_s}, which exceeds max_timeout_s={max_timeout_s}"
+        raise ValueError(msg)
+
     # SLURM wall-clock = entry's effective timeout + a fixed cleanup buffer, floored at
     # min_timeout_s so short entries get enough time for container setup before their run starts.
-    timeout_s = max(entry.get("timeout_s", default_timeout_s) + cleanup_timeout_s, min_timeout_s)
+    timeout_s = max(entry_timeout_s + cleanup_timeout_s, min_timeout_s)
     time_str = seconds_to_time(timeout_s)
 
     return {
@@ -124,6 +139,7 @@ def generate_pipeline(curator_dir: str, scope: str, session_name: str | None = N
     default_timeout_s = config.get("default_timeout_s", DEFAULT_TIMEOUT_S)
     cleanup_timeout_s = config.get("cleanup_timeout_s", DEFAULT_CLEANUP_TIMEOUT_S)
     min_timeout_s = config.get("min_timeout_s", DEFAULT_MIN_TIMEOUT_S)
+    max_timeout_s = config.get("max_timeout_s", DEFAULT_MAX_TIMEOUT_S)
 
     pipeline = {
         "include": ["curator/curator_ci_template.yml"],
@@ -137,7 +153,9 @@ def generate_pipeline(curator_dir: str, scope: str, session_name: str | None = N
         if not entry.get("enabled", True):
             continue
 
-        pipeline[entry["name"]] = generate_job(entry, scope, default_timeout_s, cleanup_timeout_s, min_timeout_s)
+        pipeline[entry["name"]] = generate_job(
+            entry, scope, default_timeout_s, cleanup_timeout_s, min_timeout_s, max_timeout_s
+        )
         job_count += 1
 
     if job_count == 0:
diff --git a/nemo_curator/backends/ray_data/adapter.py b/nemo_curator/backends/ray_data/adapter.py
@@ -73,6 +73,23 @@ def _process_batch_internal(self, batch: dict[str, Any]) -> dict[str, Any]:
         # For Task objects, we return them in the 'item' column
         return {"item": results}
 
+    def _build_resource_kwargs(self, ray_stage_spec: dict) -> dict[str, float]:
+        """Build num_cpus/num_gpus kwargs for map_batches.
+
+        Checks ray_stage_spec for RAY_NUM_CPUS first so stages can request a
+        different CPU reservation for Ray Data (e.g. cpus=1.0 to enable stage
+        fusion) without changing resources.cpus used by other executors.
+        """
+        kwargs: dict[str, float] = {}
+        ray_num_cpus = ray_stage_spec.get(RayStageSpecKeys.RAY_NUM_CPUS)
+        if ray_num_cpus is not None:
+            kwargs["num_cpus"] = ray_num_cpus  # type: ignore[reportArgumentType]
+        elif self.stage.resources.cpus > 0:
+            kwargs["num_cpus"] = self.stage.resources.cpus  # type: ignore[reportArgumentType]
+        if self.stage.resources.gpus > 0:
+            kwargs["num_gpus"] = self.stage.resources.gpus  # type: ignore[reportArgumentType]
+        return kwargs
+
     def process_dataset(self, dataset: Dataset) -> Dataset:
         """Process a Ray Data dataset through this stage.
 
@@ -107,10 +124,7 @@ def process_dataset(self, dataset: Dataset) -> Dataset:
             if max_calls is not None:
                 map_batches_kwargs["max_calls"] = max_calls
 
-        if self.stage.resources.cpus > 0:
-            map_batches_kwargs["num_cpus"] = self.stage.resources.cpus  # type: ignore[reportArgumentType]
-        if self.stage.resources.gpus > 0:
-            map_batches_kwargs["num_gpus"] = self.stage.resources.gpus  # type: ignore[reportArgumentType]
+        map_batches_kwargs.update(self._build_resource_kwargs(ray_stage_spec))
 
         # Per-stage ray_remote_args (e.g. runtime_env with different pip versions per stage).
         ray_remote_args = copy.deepcopy(ray_stage_spec.get(RayStageSpecKeys.RAY_REMOTE_ARGS) or {})
diff --git a/nemo_curator/backends/utils.py b/nemo_curator/backends/utils.py
@@ -134,6 +134,7 @@ class RayStageSpecKeys(str, Enum):
     MAX_WORKERS = "max_workers"
     INITIAL_WORKERS = "initial_workers"
     RAY_REMOTE_ARGS = "ray_remote_args"
+    RAY_NUM_CPUS = "ray_num_cpus"
 
 
 def get_worker_metadata_and_node_id() -> tuple[NodeInfo, WorkerMetadata]:
diff --git a/nemo_curator/stages/video/clipping/clip_extraction_stages.py b/nemo_curator/stages/video/clipping/clip_extraction_stages.py
@@ -38,7 +38,7 @@ class ClipTranscodingStage(ProcessingStage[VideoTask, VideoTask]):
     software (libx264, libopenh264) and hardware (NVENC) encoding with configurable parameters.
 
     Args:
-        num_cpus_per_worker: Number of CPUs per worker.
+        num_cpus_per_worker: Number of CPUs per worker for Xenna scheduling. Does not affect Ray Data CPU scheduling; use ray_data_num_cpus for that.
         encoder: Video encoder to use.
         encoder_threads: Number of threads per encoder.
         encode_batch_size: Number of clips to encode in parallel.
@@ -48,6 +48,7 @@ class ClipTranscodingStage(ProcessingStage[VideoTask, VideoTask]):
         num_clips_per_chunk: Number of clips per chunk. If the number of clips is larger than this, the clips will be split into chunks, and created VideoTasks for each chunk.
         verbose: Whether to print verbose logs.
         ffmpeg_verbose: Whether to print FFmpeg verbose logs.
+        ray_data_num_cpus: CPU cores reserved per Ray Data actor for this stage. Defaults to None, which on the CPU encoder path is replaced with 1.0 to enable stage fusion with upstream stages; pass an explicit value to reserve a different amount. On other paths None is left unchanged and Ray Data uses resources.cpus. Does not affect Xenna scheduling.
     """
 
     num_cpus_per_worker: float = 6.0
@@ -61,6 +62,9 @@ class ClipTranscodingStage(ProcessingStage[VideoTask, VideoTask]):
     ffmpeg_verbose: bool = False
     verbose: bool = False
     name: str = "clip_transcoding"
+    ray_data_num_cpus: float | None = (
+        None  # CPU reservation for Ray Data scheduler; set to 1.0 on CPU path to enable stage fusion
+    )
 
     def setup(self, worker_metadata: WorkerMetadata | None = None) -> None:  # noqa: ARG002
         """Setup method called once before processing begins.
@@ -83,6 +87,11 @@ def __post_init__(self) -> None:
                 self.resources = Resources(gpus=1)
         else:
             self.resources = Resources(cpus=self.num_cpus_per_worker)
+            if self.ray_data_num_cpus is None:
+                # Default to 1.0 so Ray Data fuses this stage with VideoReaderStage
+                # and FixedStrideExtractorStage. Kept separate from resources.cpus
+                # so Xenna scheduling is unaffected.
+                self.ray_data_num_cpus = 1.0
 
     def inputs(self) -> tuple[list[str], list[str]]:
         return ["data"], ["source_bytes"]
@@ -92,9 +101,10 @@ def outputs(self) -> tuple[list[str], list[str]]:
 
     def ray_stage_spec(self) -> dict[str, Any]:
         """Ray stage specification for this stage."""
-        return {
-            RayStageSpecKeys.IS_FANOUT_STAGE: True,
-        }
+        spec: dict[str, Any] = {RayStageSpecKeys.IS_FANOUT_STAGE: True}
+        if self.ray_data_num_cpus is not None:
+            spec[RayStageSpecKeys.RAY_NUM_CPUS] = self.ray_data_num_cpus
+        return spec
 
     def process(self, task: VideoTask) -> VideoTask:
         video = task.data
diff --git a/nemo_curator/stages/video/clipping/video_frame_extraction.py b/nemo_curator/stages/video/clipping/video_frame_extraction.py
@@ -15,12 +15,14 @@
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 from loguru import logger
 
 from nemo_curator.backends.base import WorkerMetadata
+from nemo_curator.backends.utils import RayStageSpecKeys
 from nemo_curator.stages.base import ProcessingStage
 from nemo_curator.stages.resources import Resources
 from nemo_curator.tasks.video import VideoTask
@@ -100,6 +102,9 @@ class VideoFrameExtractionStage(ProcessingStage[VideoTask, VideoTask]):
     decoder_mode: str = "pynvc"
     verbose: bool = False
     name: str = "video_frame_extraction"
+    ray_data_num_cpus: float | None = (
+        None  # CPU reservation for Ray Data scheduler; set to 1.0 on CPU path to enable stage fusion
+    )
 
     def inputs(self) -> tuple[list[str], list[str]]:
         return ["data"], []
@@ -130,6 +135,16 @@ def __post_init__(self) -> None:
             self.resources = Resources(gpu_memory_gb=10)
         else:
             self.resources = Resources(cpus=4.0)
+            if self.ray_data_num_cpus is None:
+                # Default to 1.0 so Ray Data fuses this stage with VideoReaderStage.
+                # Kept separate from resources.cpus so Xenna scheduling is unaffected.
+                self.ray_data_num_cpus = 1.0
+
+    def ray_stage_spec(self) -> dict[str, Any]:
+        """Ray stage specification for this stage."""
+        if self.ray_data_num_cpus is not None:
+            return {RayStageSpecKeys.RAY_NUM_CPUS: self.ray_data_num_cpus}
+        return {}
 
     def process(self, task: VideoTask) -> VideoTask:
         width, height = self.output_hw
diff --git a/tests/backends/ray_data/test_adapter.py b/tests/backends/ray_data/test_adapter.py
@@ -130,6 +130,16 @@ def test_process_dataset_rejects_managed_ray_remote_args(self):
         with pytest.raises(ValueError, match="must not override Curator-managed map_batches arguments"):
             _map_batches_kwargs(stage)
 
+    def test_build_resource_kwargs_uses_ray_num_cpus_from_spec_over_resources_cpus(self):
+        stage = ConfigurableActorStage(ray_stage_spec={RayStageSpecKeys.RAY_NUM_CPUS: 1.0})
+        kwargs = _map_batches_kwargs(stage)
+        assert kwargs["num_cpus"] == 1.0
+
+    def test_build_resource_kwargs_falls_back_to_resources_cpus_when_ray_num_cpus_absent(self):
+        stage = ConfigurableActorStage()
+        kwargs = _map_batches_kwargs(stage)
+        assert kwargs["num_cpus"] == stage.resources.cpus
+
 
 def _map_batches_kwargs(stage: ProcessingStage) -> dict[str, object]:
     dataset = RecordingDataset()
diff --git a/tests/benchmarking/__init__.py b/tests/benchmarking/__init__.py
diff --git a/tests/benchmarking/runner/__init__.py b/tests/benchmarking/runner/__init__.py
diff --git a/tests/benchmarking/runner/test_session.py b/tests/benchmarking/runner/test_session.py
diff --git a/tests/stages/video/clipping/test_clip_transcoding_stage.py b/tests/stages/video/clipping/test_clip_transcoding_stage.py
diff --git a/tests/stages/video/clipping/test_video_frame_extraction.py b/tests/stages/video/clipping/test_video_frame_extraction.py