NVIDIA-NeMo
diff --git a/PR1967_FEATURE_WALKTHROUGH.md b/PR1967_FEATURE_WALKTHROUGH.md
Lines changed: 550 additions & 95 deletions
diff --git a/examples/audio/qwen_omni_inprocess/README.md b/examples/audio/qwen_omni_inprocess/README.md
Lines changed: 28 additions & 0 deletions
diff --git a/nemo_curator/pipeline/payload_lifecycle.py b/nemo_curator/pipeline/payload_lifecycle.py
Lines changed: 157 additions & 1 deletion
diff --git a/nemo_curator/pipeline/pipeline.py b/nemo_curator/pipeline/pipeline.py
Lines changed: 15 additions & 15 deletions
@@ -0,0 +1,28 @@
+# Qwen-Omni In-Process ASR Assets
+
+This folder contains prompt templates used by the Qwen-Omni in-process ASR
+adapter.
+
+The executable code path is:
+
+```text
+Pipeline
+  -> ManifestReader
+  -> AudioPayloadMaterializeStage
+  -> ASRStage(adapter_target=QwenOmniASRAdapter)
+  -> PayloadReleaseStage
+  -> ManifestWriterStage
+```
+
+The adapter reads prompt text through `prompt_file`, `en_prompt_file`,
+`followup_prompt_file`, or `system_prompt_file`. Curator stage behavior remains
+outside the prompt files:
+
+- graph expansion lives in `nemo_curator/pipeline/payload_lifecycle.py`;
+- audio decode and payload refs live in `nemo_curator/stages/payload_lifecycle.py`;
+- local/windowed ASR model-input segmentation and batching live in
+  `nemo_curator/stages/audio/inference/asr/stage.py`;
+- Qwen/vLLM request construction lives in `nemo_curator/models/asr/qwen_omni.py`.
+
+Prompt files may use `{language}` and `{transcript}` placeholders when the
+stage supplies language or reference text columns.
@@ -82,23 +82,36 @@ def expand_payload_lifecycle_stages(
 
     reader = _last_manifest_reader(stages[: materialize_idx + 1])
     payload_specs = _payload_binding_specs(payload_cfg, stages=stages, consumers=consumers, reader=reader)
+    _configure_planned_source_segment_inputs(reader, payload_cfg, payload_specs, config)
     _validate_payload_consumers(consumers, payload_specs)
+    _validate_single_segment_planner_owner(
+        reader,
+        consumers,
+        config=config,
+    )
 
-    materializers = [_build_payload_materializer(reader, spec, payload_cfg, config, run_id=run_id) for spec in payload_specs]
+    materializers = [
+        _build_payload_materializer(reader, spec, payload_cfg, config, run_id=run_id)
+        for spec in payload_specs
+    ]
     primary_spec = payload_specs[0]
     release = payload_release_stage_cls(
         name=str(payload_cfg.get("release_stage_name", "payload_release")),
         payload_ref_key=primary_spec.ref_key,
         waveform_key=primary_spec.waveform_key,
     )
 
+    assembler = _post_release_payload_lifecycle_stage(config, reader, consumers, primary_spec, run_id=run_id)
+
     expanded: list[ProcessingStage] = []
     for idx, stage in enumerate(stages):
         expanded.append(stage)
         if idx == materialize_idx:
             expanded.extend(materializers)
         if idx == release_idx:
             expanded.append(release)
+            if assembler is not None:
+                expanded.append(assembler)
     logger.info("Expanded logical graph into payload lifecycle execution graph: {}", " -> ".join(stage.name for stage in expanded))
     return expanded
 
@@ -249,6 +262,108 @@ def _stage_payload_bindings(stage: ProcessingStage) -> list[dict[str, str]]:
     return []
 
 
+def _configure_planned_source_segment_inputs(
+    reader: ProcessingStage | None,
+    payload_cfg: dict[str, Any],
+    payload_specs: list[PayloadBindingSpec],
+    config: Any,
+) -> None:
+    if reader is None or not bool(getattr(reader, "enable_global_bucketing", False)):
+        return
+    scheduler_cfg = _config_section(config, "global_audio_scheduler")
+    configured = scheduler_cfg.get("segment_input_keys", payload_cfg.get("segment_input_keys"))
+    segment_input_keys: list[str] = []
+    if configured is not None:
+        segment_input_keys.extend(_normalise_string_list(configured, key="global_audio_scheduler.segment_input_keys"))
+    segment_input_keys.extend(spec.source_key for spec in payload_specs)
+    setattr(reader, "segment_input_keys", _dedupe_strings(segment_input_keys))
+    setattr(reader, "run_id", _pipeline_run_id(config))
+    if "parent_store_actor_name_prefix" in scheduler_cfg:
+        setattr(reader, "parent_store_actor_name_prefix", str(scheduler_cfg["parent_store_actor_name_prefix"]))
+
+
+def _validate_single_segment_planner_owner(
+    reader: ProcessingStage | None,
+    consumers: list[ProcessingStage],
+    *,
+    config: Any,
+) -> None:
+    if reader is None or not bool(getattr(reader, "enable_global_bucketing", False)):
+        return
+    owner_stage = _single_selector(getattr(reader, "owner_stage", None), key="global_audio_scheduler.owner_stage")
+    matching_consumers = [stage for stage in consumers if owner_stage in _stage_match_idents(stage)]
+    if not matching_consumers:
+        available = sorted({ident for stage in consumers for ident in _stage_match_idents(stage)})
+        msg = (
+            "global_audio_scheduler.owner_stage must select exactly one stage listed in "
+            "payload_lifecycle.consumers. Global bucketing has a single planning owner; "
+            f"{owner_stage!r} was not found in payload consumers {available}."
+        )
+        raise ValueError(msg)
+    if len(matching_consumers) > 1:
+        names = [stage.name for stage in matching_consumers]
+        msg = f"global_audio_scheduler.owner_stage must select exactly one payload consumer; matched {names}"
+        raise ValueError(msg)
+    _validate_planner_owner_has_largest_model_window(reader=reader, owner=matching_consumers[0], consumers=consumers)
+    setattr(reader, "owner_stage", owner_stage)
+
+
+def _validate_planner_owner_has_largest_model_window(
+    *,
+    reader: ProcessingStage,
+    owner: ProcessingStage,
+    consumers: list[ProcessingStage],
+) -> None:
+    owner_max_s = _required_positive_seconds(owner, "max_inference_duration_s")
+    consumer_max_s = [(stage.name, _required_positive_seconds(stage, "max_inference_duration_s")) for stage in consumers]
+    larger_consumers = [(name, max_s) for name, max_s in consumer_max_s if max_s > owner_max_s]
+    if larger_consumers:
+        details = ", ".join(f"{name}={value:g}s" for name, value in larger_consumers)
+        msg = (
+            "global_audio_scheduler.owner_stage must select the payload consumer with the largest "
+            "max_inference_duration_s because the source planner emits one segment plan. "
+            f"Selected owner {owner.name!r} has max_inference_duration_s={owner_max_s:g}s, "
+            f"but larger consumer(s) exist: {details}."
+        )
+        raise ValueError(msg)
+
+    reader_max_s = _required_positive_seconds(reader, "max_inference_duration_s")
+    if abs(reader_max_s - owner_max_s) > 1e-6:
+        msg = (
+            "ManifestReader(enable_global_bucketing=True).max_inference_duration_s must match the "
+            "selected owner stage's max_inference_duration_s. "
+            f"Reader has {reader_max_s:g}s, owner {owner.name!r} has {owner_max_s:g}s."
+        )
+        raise ValueError(msg)
+
+
+def _required_positive_seconds(stage: ProcessingStage, attr: str) -> float:
+    value = getattr(stage, attr, None)
+    if value is None:
+        msg = f"Global bucketing requires stage {stage.name!r} to define positive {attr}"
+        raise ValueError(msg)
+    return _positive_seconds(value, label=f"{stage.name}.{attr}")
+
+
+def _optional_positive_seconds(stage: ProcessingStage, attr: str) -> float | None:
+    value = getattr(stage, attr, None)
+    if value is None:
+        return None
+    return _positive_seconds(value, label=f"{stage.name}.{attr}")
+
+
+def _positive_seconds(value: Any, *, label: str) -> float:
+    try:
+        seconds = float(value)
+    except (TypeError, ValueError) as exc:
+        msg = f"{label} must be a positive number of seconds, got {value!r}"
+        raise TypeError(msg) from exc
+    if seconds <= 0:
+        msg = f"{label} must be > 0 seconds, got {seconds:g}"
+        raise ValueError(msg)
+    return seconds
+
+
 def _build_payload_materializer(
     reader: ProcessingStage | None,
     spec: PayloadBindingSpec,
@@ -274,6 +389,33 @@ def _build_payload_materializer(
     )
 
 
+def _post_release_payload_lifecycle_stage(
+    config: Any,
+    reader: ProcessingStage | None,
+    consumers: list[ProcessingStage],
+    primary_spec: PayloadBindingSpec,
+    *,
+    run_id: str,
+) -> ProcessingStage | None:
+    if reader is None or not bool(getattr(reader, "enable_global_bucketing", False)):
+        return None
+    builder = getattr(reader, "build_payload_lifecycle_post_release_stage", None)
+    if not callable(builder):
+        msg = (
+            "Global bucketing is enabled, but the source/reader stage does not provide "
+            "build_payload_lifecycle_post_release_stage(). The central payload lifecycle "
+            "planner only owns generic insertion order; modality-specific assembly must be "
+            f"provided by the planner stage, got {type(reader).__name__}."
+        )
+        raise ValueError(msg)
+    return builder(
+        pipeline_config=config,
+        consumers=consumers,
+        primary_payload_spec=primary_spec,
+        run_id=run_id,
+    )
+
+
 def _pipeline_run_id(config: Any) -> str:
     value = _config_get(config, "_curator_pipeline_run_id")
     text = str(value or "").strip()
@@ -352,6 +494,20 @@ def _normalise_string_list(value: Any, *, key: str) -> list[str]:
     return result
 
 
+def _dedupe_strings(values: list[str]) -> list[str]:
+    result: list[str] = []
+    seen: set[str] = set()
+    for value in values:
+        text = str(value).strip()
+        if text and text not in seen:
+            seen.add(text)
+            result.append(text)
+    if not result:
+        msg = "At least one non-empty string is required"
+        raise ValueError(msg)
+    return result
+
+
 def _single_selector(value: Any, *, key: str) -> str:
     values = _normalise_string_list(value, key=key)
     if len(values) != 1:
 
@@ -136,20 +136,26 @@ def build(self) -> None:
         self._built = True
         self._planned_stage_snapshot = list(self.stages)
 
+    def _expand_pipeline_graph(self, stages: list[ProcessingStage]) -> list[ProcessingStage]:
+        """Apply generic pipeline-level graph expansion rules."""
+        from nemo_curator.pipeline.payload_lifecycle import expand_payload_lifecycle_stages
+
+        return expand_payload_lifecycle_stages(stages, self.config)
+
     def _sync_public_stage_mutations(self) -> None:
-        """Respect direct ``pipeline.stages`` edits made through the public API.
+        """Preserve the historical public ``stages`` list mutation behavior.
 
-        The logical/execution split keeps graph expansion idempotent, but
-        ``stages`` is still a public list in Curator. If user code mutates that
-        list directly, treat it as the new logical graph before planning.
+        ``_logical_stages`` is the canonical source for graph expansion, but
+        existing user code may still mutate ``pipeline.stages`` directly. Treat
+        those mutations as logical graph edits before planning instead of
+        silently ignoring them.
         """
         if self._built:
             if self.stages == self._planned_stage_snapshot:
                 return
             logger.warning(
-                "Pipeline '{}' execution-stage list was modified after build(); treating the current stages "
-                "as the new logical graph",
-                self.name,
+                "Pipeline.stages was mutated after build(); treating the current public stages list "
+                "as the new logical graph. Prefer Pipeline.add_stage() for future code."
             )
             self._clear_default_source_sink_roles()
             self._logical_stages = list(self.stages)
@@ -159,19 +165,13 @@ def _sync_public_stage_mutations(self) -> None:
 
         if self.stages != self._logical_stages:
             logger.warning(
-                "Pipeline '{}' stages list was modified directly; syncing it into the logical graph",
-                self.name,
+                "Pipeline.stages was mutated directly; syncing it into the logical graph. "
+                "Prefer Pipeline.add_stage() for future code."
             )
             self._clear_default_source_sink_roles()
             self._logical_stages = list(self.stages)
             self._planned_stage_snapshot = []
 
-    def _expand_pipeline_graph(self, stages: list[ProcessingStage]) -> list[ProcessingStage]:
-        """Apply generic pipeline-level graph expansion rules."""
-        from nemo_curator.pipeline.payload_lifecycle import expand_payload_lifecycle_stages
-
-        return expand_payload_lifecycle_stages(stages, self.config)
-
     def _clear_default_source_sink_roles(self) -> None:
         """Clear source/sink roles that were assigned by a previous build.